• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkStream.h"
9 #include "include/core/SkString.h"
10 #include "include/private/SkHalf.h"
11 #include "include/private/SkTFitsIn.h"
12 #include "include/private/SkThreadID.h"
13 #include "src/core/SkColorSpacePriv.h"
14 #include "src/core/SkColorSpaceXformSteps.h"
15 #include "src/core/SkCpu.h"
16 #include "src/core/SkEnumerate.h"
17 #include "src/core/SkOpts.h"
18 #include "src/core/SkVM.h"
19 #include <algorithm>
20 #include <atomic>
21 #include <queue>
22 
23 #if defined(SKVM_LLVM)
24     #include <future>
25     #include <llvm/Bitcode/BitcodeWriter.h>
26     #include <llvm/ExecutionEngine/ExecutionEngine.h>
27     #include <llvm/IR/IRBuilder.h>
28     #include <llvm/IR/Verifier.h>
29     #include <llvm/Support/TargetSelect.h>
30     #include <llvm/Support/Host.h>
31 
32     // Platform-specific intrinsics got their own files in LLVM 10.
33     #if __has_include(<llvm/IR/IntrinsicsX86.h>)
34         #include <llvm/IR/IntrinsicsX86.h>
35     #endif
36 #endif
37 
38 // #define SKVM_LLVM_WAIT_FOR_COMPILATION
39 
// Global runtime switches for the JIT; both default to off.
bool gSkVMAllowJIT{false};     // When false, programs always run on the interpreter.
bool gSkVMJITViaDylib{false};  // Presumably routes JIT output through a dylib (debugging aid) — TODO confirm at use sites.
42 
43 #if defined(SKVM_JIT)
44     #if defined(SK_BUILD_FOR_WIN)
45         #include "src/core/SkLeanWindows.h"
46         #include <memoryapi.h>
47 
        // Reserve and commit a read/write buffer to hold JIT'd code.
        // VirtualAlloc already works at page granularity, so *len is not adjusted.
        static void* alloc_jit_buffer(size_t* len) {
            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
        }
        // Flip a JIT buffer from read/write to read/execute once code has been written.
        static void remap_as_executable(void* ptr, size_t len) {
            DWORD old;
            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
            SkASSERT(old == PAGE_READWRITE);  // We only ever transition RW -> RX.
        }
56         #if !defined(SKVM_LLVM)
        // Release a buffer from alloc_jit_buffer().  VirtualFree(MEM_RELEASE)
        // requires size 0 and frees the entire original reservation, so len is unused.
        static void unmap_jit_buffer(void* ptr, size_t len) {
            VirtualFree(ptr, 0, MEM_RELEASE);
        }
        // Dylib-based JIT is not implemented on Windows.
        static void close_dylib(void* dylib) {
            SkASSERT(false);  // TODO?  For now just assert we never make one.
        }
63         #endif
64     #else
65         #include <dlfcn.h>
66         #include <sys/mman.h>
67 
alloc_jit_buffer(size_t * len)68         static void* alloc_jit_buffer(size_t* len) {
69             // While mprotect and VirtualAlloc both work at page granularity,
70             // mprotect doesn't round up for you, and instead requires *len is at page granularity.
71             const size_t page = sysconf(_SC_PAGESIZE);
72             *len = ((*len + page - 1) / page) * page;
73             return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
74         }
        // Flip a JIT buffer to read/execute, then flush instruction caches so
        // freshly written code is visible to the CPU before we jump into it.
        static void remap_as_executable(void* ptr, size_t len) {
            mprotect(ptr, len, PROT_READ|PROT_EXEC);  // NOTE(review): return value unchecked — confirm failure here is acceptable.
            __builtin___clear_cache((char*)ptr,
                                    (char*)ptr + len);
        }
80         #if !defined(SKVM_LLVM)
        // Release a buffer from alloc_jit_buffer(); len must match the rounded-up size.
        static void unmap_jit_buffer(void* ptr, size_t len) {
            munmap(ptr, len);
        }
        // Close a dylib handle previously opened for dylib-based JIT output.
        static void close_dylib(void* dylib) {
            dlclose(dylib);
        }
87         #endif
88     #endif
89 
90     #if defined(SKVM_JIT_VTUNE)
91         #include <jitprofiling.h>
        // Register a freshly JIT'd method with VTune so it appears in profiles,
        // if a VTune sampling session is currently active.
        static void notify_vtune(const char* name, void* addr, size_t len) {
            if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) {
                iJIT_Method_Load event;
                memset(&event, 0, sizeof(event));
                event.method_id           = iJIT_GetNewMethodID();
                event.method_name         = const_cast<char*>(name);  // API takes char*, but does not modify it.
                event.method_load_address = addr;
                event.method_size         = len;
                iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event);
            }
        }
103     #else
        // VTune support disabled: no-op stub with the same signature.
        static void notify_vtune(const char* name, void* addr, size_t len) {}
105     #endif
106 #endif
107 
108 // JIT code isn't MSAN-instrumented, so we won't see when it uses
109 // uninitialized memory, and we'll not see the writes it makes as properly
110 // initializing memory.  Instead force the interpreter, which should let
111 // MSAN see everything our programs do properly.
112 //
113 // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
114 #if defined(__has_feature)
115     #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
116         #define SKVM_JIT_BUT_IGNORE_IT
117     #endif
118 #endif
119 
120 #if defined(SKSL_STANDALONE)
121     // skslc needs to link against this module (for the VM code generator). This module pulls in
122     // color-space code, but attempting to add those transitive dependencies to skslc gets out of
123     // hand. So we terminate the chain here with stub functions. Note that skslc's usage of SkVM
124     // never cares about color management.
    // Stub for skslc: returns the input unchanged, since skslc's usage of SkVM
    // never performs color management (see the block comment above).
    skvm::F32 sk_program_transfer_fn(
        skvm::F32 v, TFKind tf_kind,
        skvm::F32 G, skvm::F32 A, skvm::F32 B, skvm::F32 C, skvm::F32 D, skvm::F32 E, skvm::F32 F) {
            return v;
    }
130 
    // Stub for skslc; color management is never exercised there.
    const skcms_TransferFunction* skcms_sRGB_TransferFunction() { return nullptr; }
    // Stub for skslc; color management is never exercised there.
    const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction() { return nullptr; }
133 #endif
134 
135 namespace skvm {
136 
    // Probe which CPU features code generation may rely on.
    // FMA: detected via SkCpu (HSW used as the proxy) on x86, assumed present
    // on ARM64, and off everywhere else.  FP16 support is not yet wired up.
    static Features detect_features() {
        static const bool fma =
        #if defined(SK_CPU_X86)
            SkCpu::Supports(SkCpu::HSW);
        #elif defined(SK_CPU_ARM64)
            true;
        #else
            false;
        #endif

        static const bool fp16 = false;  // TODO

        return { fma, fp16 };
    }
151 
    // Default: detect features from the running CPU.  The explicit-Features
    // overload lets callers (e.g. tests) force a particular feature set.
    Builder::Builder()                  : fFeatures(detect_features()) {}
    Builder::Builder(Features features) : fFeatures(features         ) {}
154 
155 
    // Private state backing a Program: the interpreter fallback plus any
    // JIT/LLVM artifacts produced for it.
    struct Program::Impl {
        std::vector<InterpreterInstruction> instructions;  // interpreter program
        int regs = 0;                                      // registers the interpreter needs
        int loop = 0;                                      // index of the first in-loop instruction
        std::vector<int> strides;                          // per-argument byte strides

        // Entry point of JIT-compiled code; nullptr until compilation finishes.
        std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
        size_t jit_size = 0;                     // bytes mapped for JIT code
        void*  dylib    = nullptr;               // non-null when JIT'd via a dylib

    #if defined(SKVM_LLVM)
        std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
        std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
        std::future<void>                      llvm_compiling;  // completes when LLVM compilation is done
    #endif
    };
172 
173     // Debugging tools, mostly for printing various data structures out to a stream.
174 
    namespace {
        // An SkWStream that forwards everything written to SkDebugf().
        // Used as the default sink by the dump() methods below.
        class SkDebugfStream final : public SkWStream {
            size_t fBytesWritten = 0;

            bool write(const void* buffer, size_t size) override {
                SkDebugf("%.*s", (int)size, (const char*)buffer);
                fBytesWritten += size;
                return true;
            }

            size_t bytesWritten() const override {
                return fBytesWritten;
            }
        };

        // Thin wrapper types that tag how an operand should be formatted by
        // the write() overloads below (value vs. register vs. immediate, etc.).
        struct V { Val id; };        // value ID, printed as "vN"
        struct R { Reg id; };        // register ID, printed as "rN"
        struct Shift { int bits; };  // shift amount, printed in decimal
        struct Splat { int bits; };  // splat immediate, printed as hex plus its float reinterpretation
        struct Hex   { int bits; };  // generic immediate, printed as hex
        // For op `trace_line` or `trace_call`
        struct Line  { int bits; };
        // For op `trace_var`
        struct VarSlot { int bits; };
        struct VarType { int bits; };
        static constexpr VarType kVarTypeInt{0};
        static constexpr VarType kVarTypeFloat{1};
        static constexpr VarType kVarTypeBool{2};
        // For op `trace_call`
        struct CallType { int bits; };
        static constexpr CallType kCallTypeEnter{1};
        static constexpr CallType kCallTypeExit{0};

        static void write(SkWStream* o, const char* s) {
            o->writeText(s);
        }

        // Map an Op to its enumerator name via X-macro expansion of SKVM_OPS.
        static const char* name(Op op) {
            switch (op) {
            #define M(x) case Op::x: return #x;
                SKVM_OPS(M)
            #undef M
            }
            return "unknown op";
        }

        static void write(SkWStream* o, Op op) {
            o->writeText(name(op));
        }
        static void write(SkWStream* o, Ptr p) {
            write(o, "ptr");
            o->writeDecAsText(p.ix);
        }
        static void write(SkWStream* o, V v) {
            write(o, "v");
            o->writeDecAsText(v.id);
        }
        static void write(SkWStream* o, R r) {
            write(o, "r");
            o->writeDecAsText(r.id);
        }
        static void write(SkWStream* o, Shift s) {
            o->writeDecAsText(s.bits);
        }
        // Splats print both the raw bits and their float interpretation,
        // e.g. "3F800000 (1)".
        static void write(SkWStream* o, Splat s) {
            float f;
            memcpy(&f, &s.bits, 4);  // reinterpret bits as float without aliasing UB
            o->writeHexAsText(s.bits);
            write(o, " (");
            o->writeScalarAsText(f);
            write(o, ")");
        }
        static void write(SkWStream* o, Hex h) {
            o->writeHexAsText(h.bits);
        }
        static void write(SkWStream* o, Line d) {
            write(o, "L");
            o->writeDecAsText(d.bits);
        }
        static void write(SkWStream* o, VarSlot s) {
            write(o, "$");
            o->writeDecAsText(s.bits);
        }
        static void write(SkWStream* o, VarType n) {
            if (n.bits == kVarTypeFloat.bits) {
                write(o, "(F32)");
            } else if (n.bits == kVarTypeInt.bits) {
                write(o, "(I32)");
            } else if (n.bits == kVarTypeBool.bits) {
                write(o, "(bool)");
            } else {
                write(o, "???");
            }
        }
        static void write(SkWStream* o, CallType n) {
            if (n.bits == kCallTypeEnter.bits) {
                write(o, "(enter)");
            } else if (n.bits == kCallTypeExit.bits) {
                write(o, "(exit)");
            } else {
                write(o, "???");
            }
        }

        // Variadic overload: write each argument, separated by single spaces.
        template <typename T, typename... Ts>
        static void write(SkWStream* o, T first, Ts... rest) {
            write(o, first);
            write(o, " ");
            write(o, rest...);
        }
    }  // namespace
286 
    // Pretty-print a single OptimizedInstruction (value `id`) to `o`,
    // formatting operands according to each op's operand kinds, followed by a newline.
    static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) {
        Op  op = inst.op;
        Val  x = inst.x,
             y = inst.y,
             z = inst.z,
             w = inst.w;
        int immA = inst.immA,
            immB = inst.immB,
            immC = inst.immC;
        switch (op) {
            // Side-effecting ops print with no "vN =" destination.
            case Op::assert_true: write(o, op, V{x}, V{y}); break;

            case Op::trace_line: write(o, op, V{x}, Line{immA}); break;
            case Op::trace_var:  write(o, op, V{x}, VarSlot{immA}, "=", V{y}, VarType{immB}); break;
            case Op::trace_call: write(o, op, V{x}, Line{immA}, CallType{immB}); break;

            case Op::store8:   write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store16:  write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store32:  write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store64:  write(o, op, Ptr{immA}, V{x},V{y}          ); break;
            case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}); break;

            // Everything below produces a value, printed as "vN = op ...".
            case Op::index: write(o, V{id}, "=", op); break;

            case Op::load8:   write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load16:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load32:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load64:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;

            case Op::gather8:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;

            case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::array32:   write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

            case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break;

            case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;


            case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;

            case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;


            case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;

            case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;


            case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;

            case Op::ceil:      write(o, V{id}, "=", op, V{x}); break;
            case Op::floor:     write(o, V{id}, "=", op, V{x}); break;
            case Op::to_f32:    write(o, V{id}, "=", op, V{x}); break;
            case Op::to_fp16:   write(o, V{id}, "=", op, V{x}); break;
            case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break;
            case Op::trunc:     write(o, V{id}, "=", op, V{x}); break;
            case Op::round:     write(o, V{id}, "=", op, V{x}); break;
        }

        write(o, "\n");
    }
375 
dump(SkWStream * o) const376     void Builder::dump(SkWStream* o) const {
377         SkDebugfStream debug;
378         if (!o) { o = &debug; }
379 
380         std::vector<OptimizedInstruction> optimized = this->optimize();
381         o->writeDecAsText(optimized.size());
382         o->writeText(" values (originally ");
383         o->writeDecAsText(fProgram.size());
384         o->writeText("):\n");
385         for (Val id = 0; id < (Val)optimized.size(); id++) {
386             const OptimizedInstruction& inst = optimized[id];
387             write(o, inst.can_hoist ? "↑ " : "  ");
388             write_one_instruction(id, inst, o);
389         }
390     }
391 
    // Dump the interpreter program in human-readable form: register and
    // instruction counts, then one line per InterpreterInstruction.  A "loop:"
    // marker and extra indentation show where the per-iteration body begins.
    // Writes to SkDebugf when no stream is supplied.
    void Program::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        o->writeDecAsText(fImpl->regs);
        o->writeText(" registers, ");
        o->writeDecAsText(fImpl->instructions.size());
        o->writeText(" instructions:\n");
        for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
            if (i == fImpl->loop) { write(o, "loop:\n"); }
            o->writeDecAsText(i);
            o->writeText("\t");
            if (i >= fImpl->loop) { write(o, "    "); }  // indent in-loop instructions
            const InterpreterInstruction& inst = fImpl->instructions[i];
            Op   op = inst.op;
            Reg   d = inst.d,
                  x = inst.x,
                  y = inst.y,
                  z = inst.z,
                  w = inst.w;
            int immA = inst.immA,
                immB = inst.immB,
                immC = inst.immC;
            switch (op) {
                // Side-effecting ops print with no "rN =" destination.
                case Op::assert_true: write(o, op, R{x}, R{y}); break;

                case Op::trace_line: write(o, op, R{x}, Line{immA}); break;
                case Op::trace_var: write(o, op, R{x}, VarSlot{immA}, "=", R{y}, VarType{immB});
                                    break;
                case Op::trace_call: write(o, op, R{x}, Line{immA}, CallType{immB}); break;

                case Op::store8:   write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store16:  write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store32:  write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store64:  write(o, op, Ptr{immA}, R{x}, R{y}            ); break;
                case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w}); break;

                // Everything below produces a value, printed as "rN = op ...".
                case Op::index: write(o, R{d}, "=", op); break;

                case Op::load8:   write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load16:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load32:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load64:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;

                case Op::gather8:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;

                case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::array32:   write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

                case Op::splat:     write(o, R{d}, "=", op, Splat{immA}); break;

                case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;

                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;

                case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
                case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
                case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
                case Op::to_fp16:   write(o, R{d}, "=", op, R{x}); break;
                case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break;
                case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
                case Op::round:     write(o, R{d}, "=", op, R{x}); break;
            }
            write(o, "\n");
        }
    }
493 
    // Remove Instructions whose results never feed a side-effecting op,
    // remapping the surviving Instructions' argument IDs to the compacted
    // index space.  Also deduplicates redundant back-to-back trace_lines.
    std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) {
        // Determine which Instructions are live by working back from side effects.
        std::vector<bool> live(program.size(), false);
        for (Val id = program.size(); id--;) {
            if (live[id] || has_side_effect(program[id].op)) {
                live[id] = true;
                const Instruction& inst = program[id];
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA) { live[arg] = true; }
                }
            }
        }

        // After removing non-live instructions, we can be left with redundant back-to-back
        // trace_line instructions. (e.g. one line could have multiple statements on it.)
        // Eliminate any duplicate ops.
        int lastId = -1;  // index of the previous live trace_line, or -1 if the run is broken
        for (Val id = 0; id < (Val)program.size(); id++) {
            if (!live[id]) {
                continue;
            }
            const Instruction& inst = program[id];
            if (inst.op != Op::trace_line) {
                lastId = -1;  // any other live op ends the run of trace_lines
                continue;
            }
            if (lastId >= 0) {
                const Instruction& last = program[lastId];
                if (inst.immA == last.immA && inst.x == last.x) {
                    // Found two matching trace_lines in a row. Mark the first one as dead.
                    live[lastId] = false;
                }
            }
            lastId = id;
        }

        // Rewrite the program with only live Instructions:
        //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
        //   - then actually remove the dead Instructions.
        std::vector<Val> new_id(program.size(), NA);
        for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
            if (live[id]) {
                Instruction& inst = program[id];
                for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) {
                    if (*arg != NA) {
                        *arg = new_id[*arg];
                        SkASSERT(*arg != NA);  // a live op's inputs must themselves be live
                    }
                }
                new_id[id] = next++;
            }
        }

        // Eliminate any non-live ops.
        auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
            Val id = (Val)(&inst - program.data());
            return !live[id];
        });
        program.erase(it, program.end());

        return program;
    }
556 
    // Convert Instructions into OptimizedInstructions, computing for each value
    // its death point (the ID of its last use) and whether it is loop-invariant
    // and may therefore be hoisted out of the per-lane loop.
    std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) {
        std::vector<OptimizedInstruction> optimized(program.size());
        for (Val id = 0; id < (Val)program.size(); id++) {
            Instruction inst = program[id];
            optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w,
                             inst.immA,inst.immB,inst.immC,
                             /*death=*/id, /*can_hoist=*/true};
        }

        // Each Instruction's inputs need to live at least until that Instruction issues.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];
            for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                // (We're walking in order, so this is the same as max()ing with the existing Val.)
                if (arg != NA) { optimized[arg].death = id; }
            }
        }

        // Mark which values don't depend on the loop and can be hoisted.
        for (OptimizedInstruction& inst : optimized) {
            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
            if (is_always_varying(inst.op) || is_trace(inst.op)) {
                inst.can_hoist = false;
            }

            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
            if (inst.can_hoist) {
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
                }
            }
        }

        // Extend the lifetime of any hoisted value that's used in the loop to infinity.
        // (program.size() is past the end of the program, so it acts as "never dies".)
        for (OptimizedInstruction& inst : optimized) {
            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA && optimized[arg].can_hoist) {
                        optimized[arg].death = (Val)program.size();
                    }
                }
            }
        }

        return optimized;
    }
603 
    // Run the optimization pipeline: dead-code elimination, then finalization
    // (lifetime + hoisting analysis) into OptimizedInstructions.
    std::vector<OptimizedInstruction> Builder::optimize() const {
        std::vector<Instruction> program = this->program();
        program = eliminate_dead_code(std::move(program));
        return    finalize           (std::move(program));
    }
609 
    // Finish building: optimize and package into a Program.  When no debug
    // name is supplied, synthesize one of the form "skvm-jit-<hash>".
    Program Builder::done(const char* debug_name, bool allow_jit) const {
        char buf[64] = "skvm-jit-";
        if (!debug_name) {
            // Append the hash just past the 9-character "skvm-jit-" prefix.
            // NOTE(review): hash() returns 64 bits but SkStrAppendU32 takes 32;
            // only the low half ends up in the name — confirm that's intended.
            *SkStrAppendU32(buf+9, this->hash()) = '\0';
            debug_name = buf;
        }

        return {this->optimize(), fStrides, debug_name, allow_jit};
    }
619 
hash() const620     uint64_t Builder::hash() const {
621         uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
622                  hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
623         return (uint64_t)lo | (uint64_t)hi << 32;
624     }
625 
    // Ptrs differ iff they refer to different argument indices.
    bool operator!=(Ptr a, Ptr b) { return a.ix != b.ix; }
627 
operator ==(const Instruction & a,const Instruction & b)628     bool operator==(const Instruction& a, const Instruction& b) {
629         return a.op   == b.op
630             && a.x    == b.x
631             && a.y    == b.y
632             && a.z    == b.z
633             && a.w    == b.w
634             && a.immA == b.immA
635             && a.immB == b.immB
636             && a.immC == b.immC;
637     }
638 
    // Hash an Instruction by its raw bytes.
    // NOTE(review): assumes Instruction either has no padding or that padding
    // bytes are consistently initialized — confirm at the struct's definition.
    uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
        return SkOpts::hash(&inst, sizeof(inst), seed);
    }
642 
643 
644     // Most instructions produce a value and return it by ID,
645     // the value-producing instruction's own index in the program vector.
    // Append an Instruction to the program, returning its value ID
    // (its index in fProgram), with common-subexpression elimination.
    Val Builder::push(Instruction inst) {
        // Basic common subexpression elimination:
        // if we've already seen this exact Instruction, use it instead of creating a new one.
        //
        // But we never dedup loads or stores: an intervening store could change that memory.
        // Uniforms and gathers touch only uniform memory, so they're fine to dedup,
        // and index is varying but doesn't touch memory, so it's fine to dedup too.
        if (!touches_varying_memory(inst.op) && !is_trace(inst.op)) {
            if (Val* id = fIndex.find(inst)) {
                return *id;
            }
        }
        Val id = static_cast<Val>(fProgram.size());
        fProgram.push_back(inst);
        // Non-dedupable ops are still recorded here; the guard above simply
        // never consults the index for them.
        fIndex.set(inst, id);
        return id;
    }
663 
arg(int stride)664     Ptr Builder::arg(int stride) {
665         int ix = (int)fStrides.size();
666         fStrides.push_back(stride);
667         return {ix};
668     }
669 
    // Debug-only: assert `cond` is true in every active lane; `debug` is an
    // extra value carried along for diagnostics.  If cond is a compile-time
    // constant it's checked immediately instead of emitting an op.
    // In release builds this is a no-op.
    void Builder::assert_true(I32 cond, I32 debug) {
    #ifdef SK_DEBUG
        int imm;
        if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
        (void)push(Op::assert_true, cond.id, debug.id);
    #endif
    }
677 
trace_line(I32 mask,int line)678     void Builder::trace_line(I32 mask, int line) {
679         if (this->isImm(mask.id, 0)) { return; }
680         (void)push(Op::trace_line, mask.id,NA,NA,NA, line);
681     }
trace_var(I32 mask,int slot,I32 val)682     void Builder::trace_var(I32 mask, int slot, I32 val) {
683         if (this->isImm(mask.id, 0)) { return; }
684         (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeInt.bits);
685     }
trace_var(I32 mask,int slot,F32 val)686     void Builder::trace_var(I32 mask, int slot, F32 val) {
687         if (this->isImm(mask.id, 0)) { return; }
688         (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeFloat.bits);
689     }
trace_var(I32 mask,int slot,bool b)690     void Builder::trace_var(I32 mask, int slot, bool b) {
691         if (this->isImm(mask.id, 0)) { return; }
692         I32 val = b ? this->splat(1) : this->splat(0);
693         (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeBool.bits);
694     }
trace_call_enter(I32 mask,int line)695     void Builder::trace_call_enter(I32 mask, int line) {
696         if (this->isImm(mask.id, 0)) { return; }
697         (void)push(Op::trace_call, mask.id,NA,NA,NA, line, kCallTypeEnter.bits);
698     }
trace_call_exit(I32 mask,int line)699     void Builder::trace_call_exit(I32 mask, int line) {
700         if (this->isImm(mask.id, 0)) { return; }
701         (void)push(Op::trace_call, mask.id,NA,NA,NA, line, kCallTypeExit.bits);
702     }
703 
store8(Ptr ptr,I32 val)704     void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); }
store16(Ptr ptr,I32 val)705     void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); }
store32(Ptr ptr,I32 val)706     void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); }
store64(Ptr ptr,I32 lo,I32 hi)707     void Builder::store64(Ptr ptr, I32 lo, I32 hi) {
708         (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix);
709     }
store128(Ptr ptr,I32 x,I32 y,I32 z,I32 w)710     void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) {
711         (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix);
712     }
713 
index()714     I32 Builder::index() { return {this, push(Op::index)}; }
715 
load8(Ptr ptr)716     I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; }
load16(Ptr ptr)717     I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; }
load32(Ptr ptr)718     I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; }
load64(Ptr ptr,int lane)719     I32 Builder::load64(Ptr ptr, int lane) {
720         return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) };
721     }
load128(Ptr ptr,int lane)722     I32 Builder::load128(Ptr ptr, int lane) {
723         return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) };
724     }
725 
gather8(UPtr ptr,int offset,I32 index)726     I32 Builder::gather8 (UPtr ptr, int offset, I32 index) {
727         return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)};
728     }
gather16(UPtr ptr,int offset,I32 index)729     I32 Builder::gather16(UPtr ptr, int offset, I32 index) {
730         return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)};
731     }
gather32(UPtr ptr,int offset,I32 index)732     I32 Builder::gather32(UPtr ptr, int offset, I32 index) {
733         return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)};
734     }
735 
uniform32(UPtr ptr,int offset)736     I32 Builder::uniform32(UPtr ptr, int offset) {
737         return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)};
738     }
739 
    // Note: this converts the array index into a byte offset for the op.
    // Reads element `index` of an int array uniform at ptr+offset.
    I32 Builder::array32  (UPtr ptr, int offset, int index) {
        return {this, push(Op::array32, NA,NA,NA,NA, ptr.ix, offset, index * sizeof(int))};
    }

    // Broadcast the constant n into every lane.  Dedups, so repeated splats are free.
    I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; }
746 
747     // Be careful peepholing float math!  Transformations you might expect to
748     // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
749     // Float peepholes must pass this equivalence test for all ~4B floats:
750     //
751     //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
752     //
753     //     unsigned bits = 0;
754     //     do {
755     //        float f;
756     //        memcpy(&f, &bits, 4);
757     //        if (!equiv(f, ...)) {
758     //           abort();
759     //        }
760     //     } while (++bits != 0);
761 
add(F32 x,F32 y)762     F32 Builder::add(F32 x, F32 y) {
763         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
764         if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
765         if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y
766 
767         if (fFeatures.fma) {
768             if (fProgram[x.id].op == Op::mul_f32) {
769                 return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
770             }
771             if (fProgram[y.id].op == Op::mul_f32) {
772                 return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
773             }
774         }
775         return {this, this->push(Op::add_f32, x.id, y.id)};
776     }
777 
sub(F32 x,F32 y)778     F32 Builder::sub(F32 x, F32 y) {
779         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
780         if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
781         if (fFeatures.fma) {
782             if (fProgram[x.id].op == Op::mul_f32) {
783                 return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
784             }
785             if (fProgram[y.id].op == Op::mul_f32) {
786                 return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
787             }
788         }
789         return {this, this->push(Op::sub_f32, x.id, y.id)};
790     }
791 
mul(F32 x,F32 y)792     F32 Builder::mul(F32 x, F32 y) {
793         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
794         if (this->isImm(y.id, 1.0f)) { return x; }  // x*1 == x
795         if (this->isImm(x.id, 1.0f)) { return y; }  // 1*y == y
796         return {this, this->push(Op::mul_f32, x.id, y.id)};
797     }
798 
fast_mul(F32 x,F32 y)799     F32 Builder::fast_mul(F32 x, F32 y) {
800         if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); }
801         return mul(x,y);
802     }
803 
div(F32 x,F32 y)804     F32 Builder::div(F32 x, F32 y) {
805         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); }
806         if (this->isImm(y.id, 1.0f)) { return x; }  // x/1 == x
807         return {this, this->push(Op::div_f32, x.id, y.id)};
808     }
809 
sqrt(F32 x)810     F32 Builder::sqrt(F32 x) {
811         if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
812         return {this, this->push(Op::sqrt_f32, x.id)};
813     }
814 
815     // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
approx_log2(F32 x)816     F32 Builder::approx_log2(F32 x) {
817         // e - 127 is a fair approximation of log2(x) in its own right...
818         F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23)));
819 
820         // ... but using the mantissa to refine its error is _much_ better.
821         F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff),
822                                 0x3f000000));
823         F32 approx = sub(e,        124.225514990f);
824             approx = sub(approx, mul(1.498030302f, m));
825             approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));
826 
827         return approx;
828     }
829 
approx_pow2(F32 x)830     F32 Builder::approx_pow2(F32 x) {
831         F32 f = fract(x);
832         F32 approx = add(x,         121.274057500f);
833             approx = sub(approx, mul( 1.490129070f, f));
834             approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));
835 
836         return pun_to_F32(round(mul(1.0f * (1<<23), approx)));
837     }
838 
approx_powf(F32 x,F32 y)839     F32 Builder::approx_powf(F32 x, F32 y) {
840         // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
841         x = max(0.0f, x);
842 
843         auto is_x = bit_or(eq(x, 0.0f),
844                            eq(x, 1.0f));
845         return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
846     }
847 
848     // Bhaskara I's sine approximation
849     // 16x(pi - x) / (5*pi^2 - 4x(pi - x)
850     // ... divide by 4
851     // 4x(pi - x) / 5*pi^2/4 - x(pi - x)
852     //
853     // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
854     // radians into that range first.
855     //
approx_sin(F32 radians)856     F32 Builder::approx_sin(F32 radians) {
857         constexpr float Pi = SK_ScalarPI;
858         // x = radians mod 2pi
859         F32 x = fract(radians * (0.5f/Pi)) * (2*Pi);
860         I32 neg = x > Pi;   // are we pi < x < 2pi --> need to negate result
861         x = select(neg, x - Pi, x);
862 
863         F32 pair = x * (Pi - x);
864         x = 4.0f * pair / ((5*Pi*Pi/4) - pair);
865         x = select(neg, -x, x);
866         return x;
867     }
868 
869     /*  "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
870          https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf
871 
872         approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9
873 
874         Some simplifications:
875         1. tan(x) is periodic, -PI/2 < x < PI/2
876         2. tan(x) is odd, so tan(-x) = -tan(x)
877         3. Our polynomial approximation is best near zero, so we use the following identity
878                         tan(x) + tan(y)
879            tan(x + y) = -----------------
880                        1 - tan(x)*tan(y)
881            tan(PI/4) = 1
882 
883            So for x > PI/8, we do the following refactor:
884            x' = x - PI/4
885 
886                     1 + tan(x')
887            tan(x) = ------------
888                     1 - tan(x')
889      */
approx_tan(F32 x)890     F32 Builder::approx_tan(F32 x) {
891         constexpr float Pi = SK_ScalarPI;
892         // periodic between -pi/2 ... pi/2
893         // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
894         x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2);
895 
896         I32 neg = (x < 0.0f);
897         x = select(neg, -x, x);
898 
899         // minimize total error by shifting if x > pi/8
900         I32 use_quotient = (x > (Pi/8));
901         x = select(use_quotient, x - (Pi/4), x);
902 
903         // 9th order poly = 4th order(x^2) * x
904         x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x;
905         x = select(use_quotient, (1+x)/(1-x), x);
906         x = select(neg, -x, x);
907         return x;
908     }
909 
910      // http://mathforum.org/library/drmath/view/54137.html
911      // referencing Handbook of Mathematical Functions,
912      //             by Milton Abramowitz and Irene Stegun
approx_asin(F32 x)913      F32 Builder::approx_asin(F32 x) {
914          I32 neg = (x < 0.0f);
915          x = select(neg, -x, x);
916          x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f);
917          x = select(neg, -x, x);
918          return x;
919      }
920 
921     /*  Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
922      *      with 129 values of x,atan(x) for x:[0...1]
923      *  This only works for 0 <= x <= 1
924      */
approx_atan_unit(F32 x)925     static F32 approx_atan_unit(F32 x) {
926         // for now we might be given NaN... let that through
927         x->assert_true((x != x) | ((x >= 0) & (x <= 1)));
928         return poly(x, 0.14130025741326729f,
929                       -0.34312835980675116f,
930                       -0.016172900528248768f,
931                        1.0037696976200385f,
932                       -0.00014758242182738969f);
933     }
934 
935     /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
936      */
approx_atan(F32 x)937     F32 Builder::approx_atan(F32 x) {
938         I32 neg = (x < 0.0f);
939         x = select(neg, -x, x);
940         I32 flip = (x > 1.0f);
941         x = select(flip, 1/x, x);
942         x = approx_atan_unit(x);
943         x = select(flip, SK_ScalarPI/2 - x, x);
944         x = select(neg, -x, x);
945         return x;
946     }
947 
948     /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
949      *  By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
950      *  which avoids a 2nd divide instruction if we had instead called atan().
951      */
approx_atan2(F32 y0,F32 x0)952     F32 Builder::approx_atan2(F32 y0, F32 x0) {
953 
954         I32 flip = (abs(y0) > abs(x0));
955         F32 y = select(flip, x0, y0);
956         F32 x = select(flip, y0, x0);
957         F32 arg = y/x;
958 
959         I32 neg = (arg < 0.0f);
960         arg = select(neg, -arg, arg);
961 
962         F32 r = approx_atan_unit(arg);
963         r = select(flip, SK_ScalarPI/2 - r, r);
964         r = select(neg, -r, r);
965 
966         // handle quadrant distinctions
967         r = select((y0 >= 0) & (x0  < 0), r + SK_ScalarPI, r);
968         r = select((y0  < 0) & (x0 <= 0), r - SK_ScalarPI, r);
969         // Note: we don't try to handle 0,0 or infinities (yet)
970         return r;
971     }
972 
min(F32 x,F32 y)973     F32 Builder::min(F32 x, F32 y) {
974         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
975         return {this, this->push(Op::min_f32, x.id, y.id)};
976     }
max(F32 x,F32 y)977     F32 Builder::max(F32 x, F32 y) {
978         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
979         return {this, this->push(Op::max_f32, x.id, y.id)};
980     }
981 
    // Integer arithmetic with constant folding.  Wrapping overflow is intended
    // program behavior here, so UBSan's signed-overflow check is suppressed on
    // the folding paths.
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::add(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(x.id, 0)) { return y; }   // 0+y == y
        if (this->isImm(y.id, 0)) { return x; }   // x+0 == x
        return {this, this->push(Op::add_i32, x.id, y.id)};
    }
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::sub(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0)) { return x; }   // x-0 == x
        return {this, this->push(Op::sub_i32, x.id, y.id)};
    }
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::mul(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(x.id, 0)) { return splat(0); }   // 0*y == 0 (safe for ints)
        if (this->isImm(y.id, 0)) { return splat(0); }   // x*0 == 0
        if (this->isImm(x.id, 1)) { return y; }          // 1*y == y
        if (this->isImm(y.id, 1)) { return x; }          // x*1 == x
        return {this, this->push(Op::mul_i32, x.id, y.id)};
    }
1004 
    // Shifts by a compile-time constant bit count.  bits==0 is the identity.
    // UBSan's shift check is suppressed for the constant-folding path.
    SK_ATTRIBUTE(no_sanitize("shift"))
    I32 Builder::shl(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
        return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)};
    }
    // Logical (zero-filling) right shift: fold via unsigned.
    I32 Builder::shr(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
        return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)};
    }
    // Arithmetic (sign-extending) right shift.
    I32 Builder::sra(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
        return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)};
    }
1021 
eq(F32 x,F32 y)1022     I32 Builder:: eq(F32 x, F32 y) {
1023         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
1024         return {this, this->push(Op::eq_f32, x.id, y.id)};
1025     }
neq(F32 x,F32 y)1026     I32 Builder::neq(F32 x, F32 y) {
1027         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
1028         return {this, this->push(Op::neq_f32, x.id, y.id)};
1029     }
lt(F32 x,F32 y)1030     I32 Builder::lt(F32 x, F32 y) {
1031         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
1032         return {this, this->push(Op::gt_f32, y.id, x.id)};
1033     }
lte(F32 x,F32 y)1034     I32 Builder::lte(F32 x, F32 y) {
1035         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
1036         return {this, this->push(Op::gte_f32, y.id, x.id)};
1037     }
gt(F32 x,F32 y)1038     I32 Builder::gt(F32 x, F32 y) {
1039         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
1040         return {this, this->push(Op::gt_f32, x.id, y.id)};
1041     }
gte(F32 x,F32 y)1042     I32 Builder::gte(F32 x, F32 y) {
1043         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
1044         return {this, this->push(Op::gte_f32, x.id, y.id)};
1045     }
1046 
eq(I32 x,I32 y)1047     I32 Builder:: eq(I32 x, I32 y) {
1048         if (x.id == y.id) { return splat(~0); }
1049         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
1050         return {this, this->push(Op:: eq_i32, x.id, y.id)};
1051     }
neq(I32 x,I32 y)1052     I32 Builder::neq(I32 x, I32 y) {
1053         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
1054         return ~(x == y);
1055     }
gt(I32 x,I32 y)1056     I32 Builder:: gt(I32 x, I32 y) {
1057         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
1058         return {this, this->push(Op:: gt_i32, x.id, y.id)};
1059     }
gte(I32 x,I32 y)1060     I32 Builder::gte(I32 x, I32 y) {
1061         if (x.id == y.id) { return splat(~0); }
1062         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
1063         return ~(x < y);
1064     }
lt(I32 x,I32 y)1065     I32 Builder:: lt(I32 x, I32 y) { return y>x; }
lte(I32 x,I32 y)1066     I32 Builder::lte(I32 x, I32 y) { return y>=x; }
1067 
bit_and(I32 x,I32 y)1068     I32 Builder::bit_and(I32 x, I32 y) {
1069         if (x.id == y.id) { return x; }
1070         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
1071         if (this->isImm(y.id, 0)) { return splat(0); }   // (x & false) == false
1072         if (this->isImm(x.id, 0)) { return splat(0); }   // (false & y) == false
1073         if (this->isImm(y.id,~0)) { return x; }          // (x & true) == x
1074         if (this->isImm(x.id,~0)) { return y; }          // (true & y) == y
1075         return {this, this->push(Op::bit_and, x.id, y.id)};
1076     }
bit_or(I32 x,I32 y)1077     I32 Builder::bit_or(I32 x, I32 y) {
1078         if (x.id == y.id) { return x; }
1079         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
1080         if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
1081         if (this->isImm(x.id, 0)) { return y; }           // (false | y) == y
1082         if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
1083         if (this->isImm(x.id,~0)) { return splat(~0); }   // (true | y) == true
1084         return {this, this->push(Op::bit_or, x.id, y.id)};
1085     }
bit_xor(I32 x,I32 y)1086     I32 Builder::bit_xor(I32 x, I32 y) {
1087         if (x.id == y.id) { return splat(0); }
1088         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
1089         if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
1090         if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
1091         return {this, this->push(Op::bit_xor, x.id, y.id)};
1092     }
1093 
bit_clear(I32 x,I32 y)1094     I32 Builder::bit_clear(I32 x, I32 y) {
1095         if (x.id == y.id) { return splat(0); }
1096         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
1097         if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
1098         if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
1099         if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
1100         return {this, this->push(Op::bit_clear, x.id, y.id)};
1101     }
1102 
select(I32 x,I32 y,I32 z)1103     I32 Builder::select(I32 x, I32 y, I32 z) {
1104         if (y.id == z.id) { return y; }
1105         if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
1106         if (this->isImm(x.id,~0)) { return y; }               // true  ? y : z == y
1107         if (this->isImm(x.id, 0)) { return z; }               // false ? y : z == z
1108         if (this->isImm(y.id, 0)) { return bit_clear(z,x); }  //     x ? 0 : z == ~x&z
1109         if (this->isImm(z.id, 0)) { return bit_and  (y,x); }  //     x ? y : 0 ==  x&y
1110         return {this, this->push(Op::select, x.id, y.id, z.id)};
1111     }
1112 
extract(I32 x,int bits,I32 z)1113     I32 Builder::extract(I32 x, int bits, I32 z) {
1114         if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
1115         return bit_and(z, shr(x, bits));
1116     }
1117 
pack(I32 x,I32 y,int bits)1118     I32 Builder::pack(I32 x, I32 y, int bits) {
1119         return bit_or(x, shl(y, bits));
1120     }
1121 
ceil(F32 x)1122     F32 Builder::ceil(F32 x) {
1123         if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
1124         return {this, this->push(Op::ceil, x.id)};
1125     }
floor(F32 x)1126     F32 Builder::floor(F32 x) {
1127         if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
1128         return {this, this->push(Op::floor, x.id)};
1129     }
to_F32(I32 x)1130     F32 Builder::to_F32(I32 x) {
1131         if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
1132         return {this, this->push(Op::to_f32, x.id)};
1133     }
trunc(F32 x)1134     I32 Builder::trunc(F32 x) {
1135         if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
1136         return {this, this->push(Op::trunc, x.id)};
1137     }
round(F32 x)1138     I32 Builder::round(F32 x) {
1139         if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
1140         return {this, this->push(Op::round, x.id)};
1141     }
1142 
to_fp16(F32 x)1143     I32 Builder::to_fp16(F32 x) {
1144         if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
1145         return {this, this->push(Op::to_fp16, x.id)};
1146     }
from_fp16(I32 x)1147     F32 Builder::from_fp16(I32 x) {
1148         if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
1149         return {this, this->push(Op::from_fp16, x.id)};
1150     }
1151 
from_unorm(int bits,I32 x)1152     F32 Builder::from_unorm(int bits, I32 x) {
1153         F32 limit = splat(1 / ((1<<bits)-1.0f));
1154         return mul(to_F32(x), limit);
1155     }
to_unorm(int bits,F32 x)1156     I32 Builder::to_unorm(int bits, F32 x) {
1157         F32 limit = splat((1<<bits)-1.0f);
1158         return round(mul(x, limit));
1159     }
1160 
SkColorType_to_PixelFormat(SkColorType ct)1161     PixelFormat SkColorType_to_PixelFormat(SkColorType ct) {
1162         auto UNORM = PixelFormat::UNORM,
1163              SRGB  = PixelFormat::SRGB,
1164              FLOAT = PixelFormat::FLOAT;
1165         switch (ct) {
1166             case kUnknown_SkColorType: break;
1167 
1168             case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96};
1169 
1170             case kRGBA_F16Norm_SkColorType:       return {FLOAT,16,16,16,16, 0,16,32,48};
1171             case kRGBA_F16_SkColorType:           return {FLOAT,16,16,16,16, 0,16,32,48};
1172             case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48};
1173 
1174             case kA16_float_SkColorType:    return {FLOAT,  0, 0,0,16, 0, 0,0,0};
1175             case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0};
1176 
1177             case kAlpha_8_SkColorType: return {UNORM, 0,0,0,8, 0,0,0,0};
1178             case kGray_8_SkColorType:  return {UNORM, 8,8,8,0, 0,0,0,0};  // Subtle.
1179 
1180             case kRGB_565_SkColorType:   return {UNORM, 5,6,5,0, 11,5,0,0};  // (BGR)
1181             case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0};  // (ABGR)
1182 
1183             case kRGBA_8888_SkColorType:  return {UNORM, 8,8,8,8,  0,8,16,24};
1184             case kRGB_888x_SkColorType:   return {UNORM, 8,8,8,0,  0,8,16,32};  // 32-bit
1185             case kBGRA_8888_SkColorType:  return {UNORM, 8,8,8,8, 16,8, 0,24};
1186             case kSRGBA_8888_SkColorType: return { SRGB, 8,8,8,8,  0,8,16,24};
1187 
1188             case kRGBA_1010102_SkColorType: return {UNORM, 10,10,10,2,  0,10,20,30};
1189             case kBGRA_1010102_SkColorType: return {UNORM, 10,10,10,2, 20,10, 0,30};
1190             case kRGB_101010x_SkColorType:  return {UNORM, 10,10,10,0,  0,10,20, 0};
1191             case kBGR_101010x_SkColorType:  return {UNORM, 10,10,10,0, 20,10, 0, 0};
1192 
1193             case kR8G8_unorm_SkColorType:   return {UNORM,  8, 8,0, 0, 0, 8,0,0};
1194             case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0};
1195             case kA16_unorm_SkColorType:    return {UNORM,  0, 0,0,16, 0, 0,0,0};
1196         }
1197         SkASSERT(false);
1198         return {UNORM, 0,0,0,0, 0,0,0,0};
1199     }
1200 
byte_size(PixelFormat f)1201     static int byte_size(PixelFormat f) {
1202         // What's the highest bit we read?
1203         int bits = std::max(f.r_bits + f.r_shift,
1204                    std::max(f.g_bits + f.g_shift,
1205                    std::max(f.b_bits + f.b_shift,
1206                             f.a_bits + f.a_shift)));
1207         // Round up to bytes.
1208         return (bits + 7) / 8;
1209     }
1210 
unpack(PixelFormat f,I32 x)1211     static Color unpack(PixelFormat f, I32 x) {
1212         SkASSERT(byte_size(f) <= 4);
1213 
1214         auto from_srgb = [](int bits, I32 channel) -> F32 {
1215             const skcms_TransferFunction* tf = skcms_sRGB_TransferFunction();
1216             F32 v = from_unorm(bits, channel);
1217             return sk_program_transfer_fn(v, sRGBish_TF,
1218                                           v->splat(tf->g),
1219                                           v->splat(tf->a),
1220                                           v->splat(tf->b),
1221                                           v->splat(tf->c),
1222                                           v->splat(tf->d),
1223                                           v->splat(tf->e),
1224                                           v->splat(tf->f));
1225         };
1226 
1227         auto unpack_rgb = [=](int bits, int shift) -> F32 {
1228             I32 channel = extract(x, shift, (1<<bits)-1);
1229             switch (f.encoding) {
1230                 case PixelFormat::UNORM: return from_unorm(bits, channel);
1231                 case PixelFormat:: SRGB: return from_srgb (bits, channel);
1232                 case PixelFormat::FLOAT: return from_fp16 (      channel);
1233             }
1234             SkUNREACHABLE;
1235         };
1236         auto unpack_alpha = [=](int bits, int shift) -> F32 {
1237             I32 channel = extract(x, shift, (1<<bits)-1);
1238             switch (f.encoding) {
1239                 case PixelFormat::UNORM:
1240                 case PixelFormat:: SRGB: return from_unorm(bits, channel);
1241                 case PixelFormat::FLOAT: return from_fp16 (      channel);
1242             }
1243             SkUNREACHABLE;
1244         };
1245         return {
1246             f.r_bits ? unpack_rgb  (f.r_bits, f.r_shift) : x->splat(0.0f),
1247             f.g_bits ? unpack_rgb  (f.g_bits, f.g_shift) : x->splat(0.0f),
1248             f.b_bits ? unpack_rgb  (f.b_bits, f.b_shift) : x->splat(0.0f),
1249             f.a_bits ? unpack_alpha(f.a_bits, f.a_shift) : x->splat(1.0f),
1250         };
1251     }
1252 
split_disjoint_8byte_format(PixelFormat f,PixelFormat * lo,PixelFormat * hi)1253     static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
1254         SkASSERT(byte_size(f) == 8);
1255         // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
1256         // The assert on byte_size(lo) will trigger if this assumption is violated.
1257         *lo = f;
1258         if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
1259         if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
1260         if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
1261         if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
1262         SkASSERT(byte_size(*lo) == 4);
1263 
1264         *hi = f;
1265         if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
1266         if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
1267         if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
1268         if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
1269         SkASSERT(byte_size(*hi) == 4);
1270     }
1271 
1272     // The only 16-byte format we support today is RGBA F32,
1273     // though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
assert_16byte_is_rgba_f32(PixelFormat f)1274     static void assert_16byte_is_rgba_f32(PixelFormat f) {
1275     #if defined(SK_DEBUG)
1276         SkASSERT(byte_size(f) == 16);
1277         PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);
1278 
1279         SkASSERT(f.encoding == rgba_f32.encoding);
1280 
1281         SkASSERT(f.r_bits == rgba_f32.r_bits);
1282         SkASSERT(f.g_bits == rgba_f32.g_bits);
1283         SkASSERT(f.b_bits == rgba_f32.b_bits);
1284         SkASSERT(f.a_bits == rgba_f32.a_bits);
1285 
1286         SkASSERT(f.r_shift == rgba_f32.r_shift);
1287         SkASSERT(f.g_shift == rgba_f32.g_shift);
1288         SkASSERT(f.b_shift == rgba_f32.b_shift);
1289         SkASSERT(f.a_shift == rgba_f32.a_shift);
1290     #endif
1291     }
1292 
load(PixelFormat f,Ptr ptr)1293     Color Builder::load(PixelFormat f, Ptr ptr) {
1294         switch (byte_size(f)) {
1295             case 1: return unpack(f, load8 (ptr));
1296             case 2: return unpack(f, load16(ptr));
1297             case 4: return unpack(f, load32(ptr));
1298             case 8: {
1299                 PixelFormat lo,hi;
1300                 split_disjoint_8byte_format(f, &lo,&hi);
1301                 Color l = unpack(lo, load64(ptr, 0)),
1302                       h = unpack(hi, load64(ptr, 1));
1303                 return {
1304                     lo.r_bits ? l.r : h.r,
1305                     lo.g_bits ? l.g : h.g,
1306                     lo.b_bits ? l.b : h.b,
1307                     lo.a_bits ? l.a : h.a,
1308                 };
1309             }
1310             case 16: {
1311                 assert_16byte_is_rgba_f32(f);
1312                 return {
1313                     pun_to_F32(load128(ptr, 0)),
1314                     pun_to_F32(load128(ptr, 1)),
1315                     pun_to_F32(load128(ptr, 2)),
1316                     pun_to_F32(load128(ptr, 3)),
1317                 };
1318             }
1319             default: SkUNREACHABLE;
1320         }
1321         return {};
1322     }
1323 
gather(PixelFormat f,UPtr ptr,int offset,I32 index)1324     Color Builder::gather(PixelFormat f, UPtr ptr, int offset, I32 index) {
1325         switch (byte_size(f)) {
1326             case 1: return unpack(f, gather8 (ptr, offset, index));
1327             case 2: return unpack(f, gather16(ptr, offset, index));
1328             case 4: return unpack(f, gather32(ptr, offset, index));
1329             case 8: {
1330                 PixelFormat lo,hi;
1331                 split_disjoint_8byte_format(f, &lo,&hi);
1332                 Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
1333                       h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
1334                 return {
1335                     lo.r_bits ? l.r : h.r,
1336                     lo.g_bits ? l.g : h.g,
1337                     lo.b_bits ? l.b : h.b,
1338                     lo.a_bits ? l.a : h.a,
1339                 };
1340             }
1341             case 16: {
1342                 assert_16byte_is_rgba_f32(f);
1343                 return {
1344                     gatherF(ptr, offset, (index<<2)+0),
1345                     gatherF(ptr, offset, (index<<2)+1),
1346                     gatherF(ptr, offset, (index<<2)+2),
1347                     gatherF(ptr, offset, (index<<2)+3),
1348                 };
1349             }
1350             default: SkUNREACHABLE;
1351         }
1352         return {};
1353     }
1354 
pack32(PixelFormat f,Color c)1355     static I32 pack32(PixelFormat f, Color c) {
1356         SkASSERT(byte_size(f) <= 4);
1357 
1358         auto to_srgb = [](int bits, F32 v) {
1359             const skcms_TransferFunction* tf = skcms_sRGB_Inverse_TransferFunction();
1360             return to_unorm(bits, sk_program_transfer_fn(v, sRGBish_TF,
1361                                                          v->splat(tf->g),
1362                                                          v->splat(tf->a),
1363                                                          v->splat(tf->b),
1364                                                          v->splat(tf->c),
1365                                                          v->splat(tf->d),
1366                                                          v->splat(tf->e),
1367                                                          v->splat(tf->f)));
1368         };
1369 
1370         I32 packed = c->splat(0);
1371         auto pack_rgb = [&](F32 channel, int bits, int shift) {
1372             I32 encoded;
1373             switch (f.encoding) {
1374                 case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
1375                 case PixelFormat:: SRGB: encoded = to_srgb (bits, channel); break;
1376                 case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
1377             }
1378             packed = pack(packed, encoded, shift);
1379         };
1380         auto pack_alpha = [&](F32 channel, int bits, int shift) {
1381             I32 encoded;
1382             switch (f.encoding) {
1383                 case PixelFormat::UNORM:
1384                 case PixelFormat:: SRGB: encoded = to_unorm(bits, channel); break;
1385                 case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
1386             }
1387             packed = pack(packed, encoded, shift);
1388         };
1389         if (f.r_bits) { pack_rgb  (c.r, f.r_bits, f.r_shift); }
1390         if (f.g_bits) { pack_rgb  (c.g, f.g_bits, f.g_shift); }
1391         if (f.b_bits) { pack_rgb  (c.b, f.b_bits, f.b_shift); }
1392         if (f.a_bits) { pack_alpha(c.a, f.a_bits, f.a_shift); }
1393         return packed;
1394     }
1395 
store(PixelFormat f,Ptr ptr,Color c)1396     void Builder::store(PixelFormat f, Ptr ptr, Color c) {
1397         // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
1398         if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
1399             f.r_shift == f.g_shift && f.g_shift == f.b_shift) {
1400 
1401             // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
1402             c.r = c.r * 0.2126f
1403                 + c.g * 0.7152f
1404                 + c.b * 0.0722f;
1405             f.g_bits = f.b_bits = 0;
1406         }
1407 
1408         switch (byte_size(f)) {
1409             case 1: store8 (ptr, pack32(f,c)); break;
1410             case 2: store16(ptr, pack32(f,c)); break;
1411             case 4: store32(ptr, pack32(f,c)); break;
1412             case 8: {
1413                 PixelFormat lo,hi;
1414                 split_disjoint_8byte_format(f, &lo,&hi);
1415                 store64(ptr, pack32(lo,c)
1416                            , pack32(hi,c));
1417                 break;
1418             }
1419             case 16: {
1420                 assert_16byte_is_rgba_f32(f);
1421                 store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a));
1422                 break;
1423             }
1424             default: SkUNREACHABLE;
1425         }
1426     }
1427 
unpremul(F32 * r,F32 * g,F32 * b,F32 a)1428     void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
1429         skvm::F32 invA = 1.0f / a,
1430                   inf  = pun_to_F32(splat(0x7f800000));
1431         // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
1432         invA = select(invA < inf, invA
1433                                 , 0.0f);
1434         *r *= invA;
1435         *g *= invA;
1436         *b *= invA;
1437     }
1438 
premul(F32 * r,F32 * g,F32 * b,F32 a)1439     void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
1440         *r *= a;
1441         *g *= a;
1442         *b *= a;
1443     }
1444 
uniformColor(SkColor4f color,Uniforms * uniforms)1445     Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) {
1446         auto [r,g,b,a] = color;
1447         return {
1448             uniformF(uniforms->pushF(r)),
1449             uniformF(uniforms->pushF(g)),
1450             uniformF(uniforms->pushF(b)),
1451             uniformF(uniforms->pushF(a)),
1452         };
1453     }
1454 
lerp(F32 lo,F32 hi,F32 t)1455     F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
1456         if (this->isImm(t.id, 0.0f)) { return lo; }
1457         if (this->isImm(t.id, 1.0f)) { return hi; }
1458         return mad(sub(hi, lo), t, lo);
1459     }
1460 
lerp(Color lo,Color hi,F32 t)1461     Color Builder::lerp(Color lo, Color hi, F32 t) {
1462         return {
1463             lerp(lo.r, hi.r, t),
1464             lerp(lo.g, hi.g, t),
1465             lerp(lo.b, hi.b, t),
1466             lerp(lo.a, hi.a, t),
1467         };
1468     }
1469 
to_hsla(Color c)1470     HSLA Builder::to_hsla(Color c) {
1471         F32 mx = max(max(c.r,c.g),c.b),
1472             mn = min(min(c.r,c.g),c.b),
1473              d = mx - mn,
1474           invd = 1.0f / d,
1475         g_lt_b = select(c.g < c.b, splat(6.0f)
1476                                  , splat(0.0f));
1477 
1478         F32 h = (1/6.0f) * select(mx == mn,  0.0f,
1479                            select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
1480                            select(mx == c.g, invd * (c.b - c.r) + 2.0f
1481                                            , invd * (c.r - c.g) + 4.0f)));
1482 
1483         F32 sum = mx + mn,
1484               l = sum * 0.5f,
1485               s = select(mx == mn, 0.0f
1486                                  , d / select(l > 0.5f, 2.0f - sum
1487                                                       , sum));
1488         return {h, s, l, c.a};
1489     }
1490 
to_rgba(HSLA c)1491     Color Builder::to_rgba(HSLA c) {
1492         // See GrRGBToHSLFilterEffect.fp
1493 
1494         auto [h,s,l,a] = c;
1495         F32 x = s * (1.0f - abs(l + l - 1.0f));
1496 
1497         auto hue_to_rgb = [&,l=l](auto hue) {
1498             auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
1499             return x * (clamp01(q) - 0.5f) + l;
1500         };
1501 
1502         return {
1503             hue_to_rgb(h + 0/3.0f),
1504             hue_to_rgb(h + 2/3.0f),
1505             hue_to_rgb(h + 1/3.0f),
1506             c.a,
1507         };
1508     }
1509 
1510     // We're basing our implementation of non-separable blend modes on
1511     //   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1512     // and
1513     //   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1514     // They're equivalent, but ES' math has been better simplified.
1515     //
1516     // Anything extra we add beyond that is to make the math work with premul inputs.
1517 
saturation(skvm::F32 r,skvm::F32 g,skvm::F32 b)1518     static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1519         return max(r, max(g, b))
1520              - min(r, min(g, b));
1521     }
1522 
luminance(skvm::F32 r,skvm::F32 g,skvm::F32 b)1523     static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1524         return r*0.30f + g*0.59f + b*0.11f;
1525     }
1526 
set_sat(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 s)1527     static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
1528         F32 mn  = min(*r, min(*g, *b)),
1529             mx  = max(*r, max(*g, *b)),
1530             sat = mx - mn;
1531 
1532         // Map min channel to 0, max channel to s, and scale the middle proportionally.
1533         auto scale = [&](skvm::F32 c) {
1534             auto scaled = ((c - mn) * s) / sat;
1535             return select(is_finite(scaled), scaled, 0.0f);
1536         };
1537         *r = scale(*r);
1538         *g = scale(*g);
1539         *b = scale(*b);
1540     }
1541 
set_lum(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 lu)1542     static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1543         auto diff = lu - luminance(*r, *g, *b);
1544         *r += diff;
1545         *g += diff;
1546         *b += diff;
1547     }
1548 
clip_color(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 a)1549     static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
1550         F32 mn  = min(*r, min(*g, *b)),
1551             mx  = max(*r, max(*g, *b)),
1552             lu = luminance(*r, *g, *b);
1553 
1554         auto clip = [&](auto c) {
1555             c = select(mn >= 0, c
1556                               , lu + ((c-lu)*(  lu)) / (lu-mn));
1557             c = select(mx >  a, lu + ((c-lu)*(a-lu)) / (mx-lu)
1558                               , c);
1559             return clamp01(c);  // May be a little negative, or worse, NaN.
1560         };
1561         *r = clip(*r);
1562         *g = clip(*g);
1563         *b = clip(*b);
1564     }
1565 
blend(SkBlendMode mode,Color src,Color dst)1566     Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
1567         auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
1568             return x*y + z*w;
1569         };
1570 
1571         auto two = [](skvm::F32 x) { return x+x; };
1572 
1573         auto apply_rgba = [&](auto fn) {
1574             return Color {
1575                 fn(src.r, dst.r),
1576                 fn(src.g, dst.g),
1577                 fn(src.b, dst.b),
1578                 fn(src.a, dst.a),
1579             };
1580         };
1581 
1582         auto apply_rgb_srcover_a = [&](auto fn) {
1583             return Color {
1584                 fn(src.r, dst.r),
1585                 fn(src.g, dst.g),
1586                 fn(src.b, dst.b),
1587                 mad(dst.a, 1-src.a, src.a),   // srcover for alpha
1588             };
1589         };
1590 
1591         auto non_sep = [&](auto R, auto G, auto B) {
1592             return Color{
1593                 R + mma(src.r, 1-dst.a,  dst.r, 1-src.a),
1594                 G + mma(src.g, 1-dst.a,  dst.g, 1-src.a),
1595                 B + mma(src.b, 1-dst.a,  dst.b, 1-src.a),
1596                 mad(dst.a, 1-src.a, src.a),   // srcover for alpha
1597             };
1598         };
1599 
1600         switch (mode) {
1601             default:
1602                 SkASSERT(false);
1603                 [[fallthrough]]; /*but also, for safety, fallthrough*/
1604 
1605             case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };
1606 
1607             case SkBlendMode::kSrc: return src;
1608             case SkBlendMode::kDst: return dst;
1609 
1610             case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]];
1611             case SkBlendMode::kSrcOver:
1612                 return apply_rgba([&](auto s, auto d) {
1613                     return mad(d,1-src.a, s);
1614                 });
1615 
1616             case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]];
1617             case SkBlendMode::kSrcIn:
1618                 return apply_rgba([&](auto s, auto d) {
1619                     return s * dst.a;
1620                 });
1621 
1622             case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]];
1623 
1624             case SkBlendMode::kSrcOut:
1625                 return apply_rgba([&](auto s, auto d) {
1626                     return s * (1-dst.a);
1627                 });
1628 
1629             case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]];
1630             case SkBlendMode::kSrcATop:
1631                 return apply_rgba([&](auto s, auto d) {
1632                     return mma(s, dst.a,  d, 1-src.a);
1633                 });
1634 
1635             case SkBlendMode::kXor:
1636                 return apply_rgba([&](auto s, auto d) {
1637                     return mma(s, 1-dst.a,  d, 1-src.a);
1638                 });
1639 
1640             case SkBlendMode::kPlus:
1641                 return apply_rgba([&](auto s, auto d) {
1642                     return min(s+d, 1.0f);
1643                 });
1644 
1645             case SkBlendMode::kModulate:
1646                 return apply_rgba([&](auto s, auto d) {
1647                     return s * d;
1648                 });
1649 
1650             case SkBlendMode::kScreen:
1651                 // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
1652                 // It's kind of plausible that s + (d - sd) keeps more precision?
1653                 return apply_rgba([&](auto s, auto d) {
1654                     return s + (d - s*d);
1655                 });
1656 
1657             case SkBlendMode::kDarken:
1658                 return apply_rgb_srcover_a([&](auto s, auto d) {
1659                     return s + (d - max(s * dst.a,
1660                                         d * src.a));
1661                 });
1662 
1663             case SkBlendMode::kLighten:
1664                 return apply_rgb_srcover_a([&](auto s, auto d) {
1665                     return s + (d - min(s * dst.a,
1666                                         d * src.a));
1667                 });
1668 
1669             case SkBlendMode::kDifference:
1670                 return apply_rgb_srcover_a([&](auto s, auto d) {
1671                     return s + (d - two(min(s * dst.a,
1672                                             d * src.a)));
1673                 });
1674 
1675             case SkBlendMode::kExclusion:
1676                 return apply_rgb_srcover_a([&](auto s, auto d) {
1677                     return s + (d - two(s * d));
1678                 });
1679 
1680             case SkBlendMode::kColorBurn:
1681                 return apply_rgb_srcover_a([&](auto s, auto d) {
1682                     auto mn   = min(dst.a,
1683                                     src.a * (dst.a - d) / s),
1684                          burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
1685                     return select(d == dst.a     , s * (1-dst.a) + d,
1686                            select(is_finite(burn), burn
1687                                                  , d * (1-src.a) + s));
1688                 });
1689 
1690             case SkBlendMode::kColorDodge:
1691                 return apply_rgb_srcover_a([&](auto s, auto d) {
1692                     auto dodge = src.a * min(dst.a,
1693                                              d * src.a / (src.a - s))
1694                                        + mma(s, 1-dst.a, d, 1-src.a);
1695                     return select(d == 0.0f       , s * (1-dst.a) + d,
1696                            select(is_finite(dodge), dodge
1697                                                   , d * (1-src.a) + s));
1698                 });
1699 
1700             case SkBlendMode::kHardLight:
1701                 return apply_rgb_srcover_a([&](auto s, auto d) {
1702                     return mma(s, 1-dst.a, d, 1-src.a) +
1703                            select(two(s) <= src.a,
1704                                   two(s * d),
1705                                   src.a * dst.a - two((dst.a - d) * (src.a - s)));
1706                 });
1707 
1708             case SkBlendMode::kOverlay:
1709                 return apply_rgb_srcover_a([&](auto s, auto d) {
1710                     return mma(s, 1-dst.a, d, 1-src.a) +
1711                            select(two(d) <= dst.a,
1712                                   two(s * d),
1713                                   src.a * dst.a - two((dst.a - d) * (src.a - s)));
1714                 });
1715 
1716             case SkBlendMode::kMultiply:
1717                 return apply_rgba([&](auto s, auto d) {
1718                     return mma(s, 1-dst.a, d, 1-src.a) + s * d;
1719                 });
1720 
1721             case SkBlendMode::kSoftLight:
1722                 return apply_rgb_srcover_a([&](auto s, auto d) {
1723                     auto  m = select(dst.a > 0.0f, d / dst.a
1724                                                  , 0.0f),
1725                          s2 = two(s),
1726                          m4 = 4*m;
1727 
1728                          // The logic forks three ways:
1729                          //    1. dark src?
1730                          //    2. light src, dark dst?
1731                          //    3. light src, light dst?
1732 
1733                          // Used in case 1
1734                     auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
1735                          // Used in case 2
1736                          darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
1737                          // Used in case 3.
1738                          liteDst = sqrt(m) - m,
1739                          // Used in 2 or 3?
1740                          liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
1741                                                                              , liteDst)
1742                                    + d * src.a;
1743                     return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
1744                                                                              , liteSrc);
1745                 });
1746 
1747             case SkBlendMode::kHue: {
1748                 skvm::F32 R = src.r * src.a,
1749                           G = src.g * src.a,
1750                           B = src.b * src.a;
1751 
1752                 set_sat   (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
1753                 set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1754                 clip_color(&R, &G, &B, src.a * dst.a);
1755 
1756                 return non_sep(R, G, B);
1757             }
1758 
1759             case SkBlendMode::kSaturation: {
1760                 skvm::F32 R = dst.r * src.a,
1761                           G = dst.g * src.a,
1762                           B = dst.b * src.a;
1763 
1764                 set_sat   (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
1765                 set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1766                 clip_color(&R, &G, &B, src.a * dst.a);
1767 
1768                 return non_sep(R, G, B);
1769             }
1770 
1771             case SkBlendMode::kColor: {
1772                 skvm::F32 R = src.r * dst.a,
1773                           G = src.g * dst.a,
1774                           B = src.b * dst.a;
1775 
1776                 set_lum   (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
1777                 clip_color(&R, &G, &B, src.a * dst.a);
1778 
1779                 return non_sep(R, G, B);
1780             }
1781 
1782             case SkBlendMode::kLuminosity: {
1783                 skvm::F32 R = dst.r * src.a,
1784                           G = dst.g * src.a,
1785                           B = dst.b * src.a;
1786 
1787                 set_lum   (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
1788                 clip_color(&R, &G, &B, dst.a * src.a);
1789 
1790                 return non_sep(R, G, B);
1791             }
1792         }
1793     }
1794 
1795     // ~~~~ Program::eval() and co. ~~~~ //
1796 
1797     // Handy references for x86-64 instruction encoding:
1798     // https://wiki.osdev.org/X86-64_Instruction_Encoding
1799     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1800     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1801     // http://ref.x86asm.net/coder64.html
1802 
1803     // Used for ModRM / immediate instruction encoding.
_233(int a,int b,int c)1804     static uint8_t _233(int a, int b, int c) {
1805         return (a & 3) << 6
1806              | (b & 7) << 3
1807              | (c & 7) << 0;
1808     }
1809 
    // ModRM byte encodes the arguments of an opcode.
    // Mod selects the addressing mode: memory with a 0-, 1-, or 4-byte
    // displacement, or a direct register operand.
    enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
    static uint8_t mod_rm(Mod mod, int reg, int rm) {
        return _233((int)mod, reg, rm);
    }
1815 
mod(int imm)1816     static Mod mod(int imm) {
1817         if (imm == 0)               { return Mod::Indirect; }
1818         if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1819         return Mod::FourByteImm;
1820     }
1821 
imm_bytes(Mod mod)1822     static int imm_bytes(Mod mod) {
1823         switch (mod) {
1824             case Mod::Indirect:    return 0;
1825             case Mod::OneByteImm:  return 1;
1826             case Mod::FourByteImm: return 4;
1827             case Mod::Direct: SkUNREACHABLE;
1828         }
1829         SkUNREACHABLE;
1830     }
1831 
    // SIB byte encodes a memory address, base + (index * scale).
    // It shares the 2-3-3 bit layout with ModRM (see _233()).
    static uint8_t sib(Assembler::Scale scale, int index, int base) {
        return _233((int)scale, index, base);
    }
1836 
1837     // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
rex(bool W,bool R,bool X,bool B)1838     static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
1839                        bool R,   // Extra top bit to select ModRM reg, registers 8-15.
1840                        bool X,   // Extra top bit for SIB index register.
1841                        bool B) { // Extra top bit for SIB base or ModRM rm register.
1842         return 0b01000000   // Fixed 0100 for top four bits.
1843              | (W << 3)
1844              | (R << 2)
1845              | (X << 1)
1846              | (B << 0);
1847     }
1848 
1849 
    // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
    struct VEX {
        int     len;        // 2 or 3 meaningful bytes (see vex() below).
        uint8_t bytes[3];
    };
1855 
vex(bool WE,bool R,bool X,bool B,int map,int vvvv,bool L,int pp)1856     static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
1857                    bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
1858                    bool   X,   // Same as REX X.
1859                    bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
1860                    int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
1861                    int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
1862                    bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
1863                    int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
1864 
1865         // Pack x86 opcode map selector to 5-bit VEX encoding.
1866         map = [map]{
1867             switch (map) {
1868                 case   0x0f: return 0b00001;
1869                 case 0x380f: return 0b00010;
1870                 case 0x3a0f: return 0b00011;
1871                 // Several more cases only used by XOP / TBM.
1872             }
1873             SkUNREACHABLE;
1874         }();
1875 
1876         // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
1877         pp = [pp]{
1878             switch (pp) {
1879                 case 0x66: return 0b01;
1880                 case 0xf3: return 0b10;
1881                 case 0xf2: return 0b11;
1882             }
1883             return 0b00;
1884         }();
1885 
1886         VEX vex = {0, {0,0,0}};
1887         if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
1888             // With these conditions met, we can optionally compress VEX to 2-byte.
1889             vex.len = 2;
1890             vex.bytes[0] = 0xc5;
1891             vex.bytes[1] = (pp      &  3) << 0
1892                          | (L       &  1) << 2
1893                          | (~vvvv   & 15) << 3
1894                          | (~(int)R &  1) << 7;
1895         } else {
1896             // We could use this 3-byte VEX prefix all the time if we like.
1897             vex.len = 3;
1898             vex.bytes[0] = 0xc4;
1899             vex.bytes[1] = (map     & 31) << 0
1900                          | (~(int)B &  1) << 5
1901                          | (~(int)X &  1) << 6
1902                          | (~(int)R &  1) << 7;
1903             vex.bytes[2] = (pp    &  3) << 0
1904                          | (L     &  1) << 2
1905                          | (~vvvv & 15) << 3
1906                          | (WE    &  1) << 7;
1907         }
1908         return vex;
1909     }
1910 
Assembler(void * buf)1911     Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {}
1912 
size() const1913     size_t Assembler::size() const { return fSize; }
1914 
bytes(const void * p,int n)1915     void Assembler::bytes(const void* p, int n) {
1916         if (fCode) {
1917             memcpy(fCode+fSize, p, n);
1918         }
1919         fSize += n;
1920     }
1921 
byte(uint8_t b)1922     void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
word(uint32_t w)1923     void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
1924 
align(int mod)1925     void Assembler::align(int mod) {
1926         while (this->size() % mod) {
1927             this->byte(0x00);
1928         }
1929     }
1930 
int3()1931     void Assembler::int3() {
1932         this->byte(0xcc);
1933     }
1934 
vzeroupper()1935     void Assembler::vzeroupper() {
1936         this->byte(0xc5);
1937         this->byte(0xf8);
1938         this->byte(0x77);
1939     }
ret()1940     void Assembler::ret() { this->byte(0xc3); }
1941 
op(int opcode,Operand dst,GP64 x)1942     void Assembler::op(int opcode, Operand dst, GP64 x) {
1943         if (dst.kind == Operand::REG) {
1944             this->byte(rex(W1,x>>3,0,dst.reg>>3));
1945             this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
1946             this->byte(mod_rm(Mod::Direct, x, dst.reg&7));
1947         } else {
1948             SkASSERT(dst.kind == Operand::MEM);
1949             const Mem& m = dst.mem;
1950             const bool need_SIB = (m.base&7) == rsp
1951                                || m.index != rsp;
1952 
1953             this->byte(rex(W1,x>>3,m.index>>3,m.base>>3));
1954             this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
1955             this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7));
1956             if (need_SIB) {
1957                 this->byte(sib(m.scale, m.index&7, m.base&7));
1958             }
1959             this->bytes(&m.disp, imm_bytes(mod(m.disp)));
1960         }
1961     }
1962 
op(int opcode,int opcode_ext,Operand dst,int imm)1963     void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) {
1964         opcode |= 0b1000'0000;   // top bit set for instructions with any immediate
1965 
1966         int imm_bytes = 4;
1967         if (SkTFitsIn<int8_t>(imm)) {
1968             imm_bytes = 1;
1969             opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
1970         }
1971 
1972         this->op(opcode, dst, (GP64)opcode_ext);
1973         this->bytes(&imm, imm_bytes);
1974     }
1975 
add(Operand dst,int imm)1976     void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); }
sub(Operand dst,int imm)1977     void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); }
cmp(Operand dst,int imm)1978     void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); }
1979 
1980     // These don't work quite like the other instructions with immediates:
1981     // these immediates are always fixed size at 4 bytes or 1 byte.
mov(Operand dst,int imm)1982     void Assembler::mov(Operand dst, int imm) {
1983         this->op(0xC7,dst,(GP64)0b000);
1984         this->word(imm);
1985     }
movb(Operand dst,int imm)1986     void Assembler::movb(Operand dst, int imm) {
1987         this->op(0xC6,dst,(GP64)0b000);
1988         this->byte(imm);
1989     }
1990 
add(Operand dst,GP64 x)1991     void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); }
sub(Operand dst,GP64 x)1992     void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); }
cmp(Operand dst,GP64 x)1993     void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); }
mov(Operand dst,GP64 x)1994     void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); }
movb(Operand dst,GP64 x)1995     void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); }
1996 
add(GP64 dst,Operand x)1997     void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); }
sub(GP64 dst,Operand x)1998     void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); }
cmp(GP64 dst,Operand x)1999     void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); }
mov(GP64 dst,Operand x)2000     void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); }
movb(GP64 dst,Operand x)2001     void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); }
2002 
movzbq(GP64 dst,Operand x)2003     void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); }
movzwq(GP64 dst,Operand x)2004     void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); }
2005 
    // AVX2 packed 32-bit integer arithmetic: op(prefix, map, opcode, dst, x, y).
    void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
    void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
    void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); }

    // Packed 16-bit integer arithmetic, averages, and min/max (signed and unsigned).
    void Assembler::vpaddw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfd, dst,x,y); }
    void Assembler::vpsubw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xf9, dst,x,y); }
    void Assembler::vpmullw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xd5, dst,x,y); }
    void Assembler::vpavgw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xe3, dst,x,y); }
    void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); }
    void Assembler::vpminsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xea, dst,x,y); }
    void Assembler::vpmaxsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xee, dst,x,y); }
    void Assembler::vpminuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); }
    void Assembler::vpmaxuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); }

    void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); }


    // Bitwise logic on the full 256-bit register.
    void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
    void Assembler::vpor  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
    void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); }
    void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); }
2027 
    // Packed single-precision float arithmetic (no mandatory prefix, 0F map).
    void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); }
    void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); }
    void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); }
    void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); }
    void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); }
    void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); }

    // Fused multiply-add family.  The 132/213/231 suffix picks which of the three
    // operands is multiplied/added; all three forms write dst.
    void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); }
    void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
    void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

    void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
    void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
    void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); }

    void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
    void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); }
    void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); }
2046 
    // Saturating narrowing packs and dword interleaves.
    void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
    void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0x67, dst,x,y); }

    void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); }
    void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }

    // Packed integer compares; results are all-ones/all-zeros lane masks.
    void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
    void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); }
    void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
    void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); }
2057 
2058 
    // Append a one-byte immediate to an instruction whose operand was just emitted.
    //
    // When we've embedded a label displacement in the middle of an instruction,
    // we need to tweak it a little so that the resolved displacement starts
    // from the end of the instruction and not the end of the displacement.
    // The extra immediate byte we're about to append moves the instruction's end
    // one byte later, so the stored displacement must shrink by one.
    void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
        if (operand.kind == Operand::LABEL && fCode) {
            // Patch the 4-byte displacement we just wrote at the end of the buffer.
            int disp;
            memcpy(&disp, fCode+fSize-4, 4);
            disp--;
            memcpy(fCode+fSize-4, &disp, 4);
        }
        this->byte(imm);
    }
2071 
    // Float compare with a predicate immediate selecting the comparison (eq/lt/le/...).
    void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) {
        this->op(0,0x0f,0xc2, dst,x,y);
        this->imm_byte_after_operand(y, imm);
    }

    // Byte blend: the selector register z is encoded in the top nibble of the
    // trailing immediate byte, hence z << 4.
    void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) {
        this->op(0x66,0x3a0f,0x4c, dst,x,y);
        this->imm_byte_after_operand(y, z << 4);
    }
2081 
    // Shift instructions encode their opcode extension as "dst", dst as x, and x as y.
    // The (Ymm)N casts below are those /N opcode extensions, not real registers.
    void Assembler::vpslld(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)6, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrld(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)2, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrad(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
        this->byte(imm);
    }
    // 16-bit lane shifts use opcode 0x71 with the same /6, /2, /4 extensions.
    void Assembler::vpsllw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)6, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsraw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)4, dst,x);
        this->byte(imm);
    }
2107 
    // Permute 64-bit lanes by immediate.
    void Assembler::vpermq(Ymm dst, Operand x, int imm) {
        // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
        this->op(0x66,0x3a0f,0x00, dst,x,W1);
        this->imm_byte_after_operand(x, imm);
    }

    // Shuffle 128-bit halves of two sources according to imm.
    void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x06, dst,x,y);
        this->imm_byte_after_operand(y, imm);
    }

    // Permute 32-bit lanes of src using per-lane indices in ix.
    void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) {
        this->op(0x66,0x380f,0x16, dst,ix,src);
    }

    // Round packed floats; Rounding selects the mode via the immediate byte.
    void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) {
        this->op(0x66,0x3a0f,0x08, dst,x);
        this->imm_byte_after_operand(x, imm);
    }
2127 
    // Vector loads/stores.  0x10 loads (dst is reg), 0x11 stores (dst is memory);
    // note the swapped argument order passed to op() for the store forms.
    void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); }
    void Assembler::vmovups(Ymm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
    void Assembler::vmovups(Xmm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
    void Assembler::vmovups(Operand dst, Ymm src) { this->op(   0,0x0f,0x11, src,dst); }
    void Assembler::vmovups(Operand dst, Xmm src) { this->op(   0,0x0f,0x11, src,dst); }

    // int32 <-> float conversions and square root.
    void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op(   0,0x0f,0x5b, dst,x); }
    void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); }
    void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); }
    void Assembler::vsqrtps   (Ymm dst, Operand x) { this->op(   0,0x0f,0x51, dst,x); }

    // float32 <-> float16 conversions; ps2ph takes a rounding-mode immediate.
    void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) {
        this->op(0x66,0x3a0f,0x1d, x,dst);
        this->imm_byte_after_operand(dst, imm);
    }
    void Assembler::vcvtph2ps(Ymm dst, Operand x) {
        this->op(0x66,0x380f,0x13, dst,x);
    }
2146 
disp19(Label * l)2147     int Assembler::disp19(Label* l) {
2148         SkASSERT(l->kind == Label::NotYetSet ||
2149                  l->kind == Label::ARMDisp19);
2150         int here = (int)this->size();
2151         l->kind = Label::ARMDisp19;
2152         l->references.push_back(here);
2153         // ARM 19-bit instruction count, from the beginning of this instruction.
2154         return (l->offset - here) / 4;
2155     }
2156 
disp32(Label * l)2157     int Assembler::disp32(Label* l) {
2158         SkASSERT(l->kind == Label::NotYetSet ||
2159                  l->kind == Label::X86Disp32);
2160         int here = (int)this->size();
2161         l->kind = Label::X86Disp32;
2162         l->references.push_back(here);
2163         // x86 32-bit byte count, from the end of this instruction.
2164         return l->offset - (here + 4);
2165     }
2166 
    // Core VEX-encoded emitter: writes VEX prefix, opcode, ModRM, and (for memory
    // operands) SIB byte and displacement.  dst/x are register numbers; their high
    // bit (>>3) goes into the VEX prefix and low 3 bits (&7) into ModRM.
    void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) {
        switch (y.kind) {
            case Operand::REG: {
                // Register-direct form: Mod::Direct with y in the r/m field.
                VEX v = vex(w, dst>>3, 0, y.reg>>3,
                            map, x, l, prefix);
                this->bytes(v.bytes, v.len);
                this->byte(opcode);
                this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7));
            } return;

            case Operand::MEM: {
                // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows;
                // without an SIB byte, that's where the base register would usually go.
                // This means we have to use an SIB byte if we want to use rsp as a base register.
                // (m.index == rsp is the sentinel for "no index", so any real index
                // also forces the SIB form.)
                const Mem& m = y.mem;
                const bool need_SIB = m.base  == rsp
                                   || m.index != rsp;

                VEX v = vex(w, dst>>3, m.index>>3, m.base>>3,
                            map, x, l, prefix);
                this->bytes(v.bytes, v.len);
                this->byte(opcode);
                this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7));
                if (need_SIB) {
                    this->byte(sib(m.scale, m.index&7, m.base&7));
                }
                // 0, 1, or 4 displacement bytes depending on mod().
                this->bytes(&m.disp, imm_bytes(mod(m.disp)));
            } return;

            case Operand::LABEL: {
                // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
                const int rip = rbp;

                VEX v = vex(w, dst>>3, 0, rip>>3,
                            map, x, l, prefix);
                this->bytes(v.bytes, v.len);
                this->byte(opcode);
                this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
                this->word(this->disp32(y.label));
            } return;
        }
    }
2209 
    // Byte shuffle, bit test (sets flags), and broadcast one float to all lanes.
    void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); }

    void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); }

    void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); }
2215 
    // Conditional jump to a label; `condition` is the 0F-map Jcc opcode byte.
    void Assembler::jump(uint8_t condition, Label* l) {
        // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
        //    7?     one-byte-disp
        //    0F 8? four-byte-disp
        // We always use the near displacement to make updating labels simpler (no resizing).
        this->byte(0x0f);
        this->byte(condition);
        this->word(this->disp32(l));
    }
    void Assembler::je (Label* l) { this->jump(0x84, l); }
    void Assembler::jne(Label* l) { this->jump(0x85, l); }
    void Assembler::jl (Label* l) { this->jump(0x8c, l); }
    void Assembler::jc (Label* l) { this->jump(0x82, l); }

    void Assembler::jmp(Label* l) {
        // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
        this->byte(0xe9);
        this->word(this->disp32(l));
    }
2235 
    // Zero-extending vector loads and low-lane scalar moves to/from Xmm.
    void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); }
    void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); }

    void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); }

    void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); }
    void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); }
2243 
    // Insert a dword/word/byte from y into lane `imm` of src, writing dst.
    void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x22, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
    void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x0f,0xc4, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
    void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x20, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
2256 
    // Extract a 128-bit half / dword / word / byte lane `imm` of src into dst.
    // These take a plain this->byte(imm) rather than imm_byte_after_operand(),
    // so a LABEL operand (whose displacement would need fixing up) is asserted away.
    void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
        this->op(0x66,0x3a0f,0x39, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x16, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x15, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrb(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x14, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
2277 
    // Gather floats from base + ix[lane]*scale under mask; mask lanes are consumed.
    // Encoded by hand rather than via op() because the index register lives in the
    // SIB byte and the mask register in the VEX vvvv field.
    void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
        // Unlike most instructions, no aliasing is permitted here.
        SkASSERT(dst != ix);
        SkASSERT(dst != mask);
        SkASSERT(mask != ix);

        int prefix = 0x66,
            map    = 0x380f,
            opcode = 0x92;
        VEX v = vex(0, dst>>3, ix>>3, base>>3,
                    map, mask, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        // rsp in r/m signals that an SIB byte follows (see op() above).
        this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/));
        this->byte(sib(scale, ix&7, base&7));
    }
2294 
2295     // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
2296 
operator ""_mask(unsigned long long bits)2297     static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
2298 
    // AArch64 emitters: assemble one fixed 32-bit instruction word from fields.
    // Three-register form: hi bits [31:21], m [20:16], lo [15:10], n [9:5], d [4:0].
    void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
        this->word( (hi & 11_mask) << 21
                  | (m  &  5_mask) << 16
                  | (lo &  6_mask) << 10
                  | (n  &  5_mask) <<  5
                  | (d  &  5_mask) <<  0);
    }
    // Two-register + immediate form; the caller pre-shifts imm into place.
    void Assembler::op(uint32_t op22, V n, V d, int imm) {
        this->word( (op22 & 22_mask) << 10
                  | imm  // size and location depends on the instruction
                  | (n    &  5_mask) <<  5
                  | (d    &  5_mask) <<  0);
    }
2312 
    // NEON bitwise ops over all 16 bytes of the vector.
    void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
    void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::not16b(V d, V n)      { this->op(0b0'1'1'01110'00'10000'00101'10,  n, d); }

    // Integer arithmetic, 4 x 32-bit lanes.
    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

    // Integer compares producing lane masks.
    void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
    void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }

    // 8 x 16-bit lane arithmetic.
    void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
    void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }
2329 
    // Float arithmetic, 4 x 32-bit lanes.
    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
    void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
    void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }

    void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); }
    void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); }

    // Float compares producing lane masks.
    void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }

    // Fused multiply-add/subtract accumulating into d.
    void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
    void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }

    // Table lookup: bytes of m index into table register n.
    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }

    // Unzip/zip 32-bit lanes of two vectors.
    void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); }
    void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); }
    void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); }
    void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); }
2353 
    // Shift-by-immediate.  Right shifts encode the amount as a negated field
    // (-immN & N_mask), matching the ISA's "two's-complement of shift" encoding.
    void Assembler::sli4s(V d, V n, int imm5) {
        this->op(0b0'1'1'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
    }
    void Assembler::shl4s(V d, V n, int imm5) {
        this->op(0b0'1'0'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
    }
    void Assembler::sshr4s(V d, V n, int imm5) {
        this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
    }
    void Assembler::ushr4s(V d, V n, int imm5) {
        this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
    }
    void Assembler::ushr8h(V d, V n, int imm4) {
        this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16);
    }
2369 
    // int <-> float conversions and float rounding, 4 x 32-bit lanes.
    void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
    void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
    void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
    void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); }
    void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); }

    // float32 <-> float16 narrowing/widening.
    void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); }
    void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); }

    // Integer narrowing (xtn) and zero-extending widening (uxtl).
    void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
    void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }

    void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
    void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }

    // Horizontal unsigned minimum across the four 32-bit lanes.
    void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }

    // Software breakpoint with a 16-bit immediate payload.
    void Assembler::brk(int imm16) {
        this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5);
    }

    void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); }
2392 
    // Scalar add/sub with 12-bit unsigned immediate; subs also sets flags.
    void Assembler::add(X d, X n, int imm12) {
        this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
    }
    void Assembler::sub(X d, X n, int imm12) {
        this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
    }
    void Assembler::subs(X d, X n, int imm12) {
        this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
    }

    // Add with a shifted register operand: d = n + (m SHIFT imm6).
    void Assembler::add(X d, X n, X m, Shift shift, int imm6) {
        SkASSERT(shift != ROR);

        // Pack m, the shift kind, and the shift amount into the immediate field.
        int imm = (imm6  & 6_mask) << 0
                | (m     & 5_mask) << 6
                | (0     & 1_mask) << 11
                | (shift & 2_mask) << 12;
        this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10);
    }
2412 
    // PC-relative branches using 19-bit instruction-count displacements (disp19).
    void Assembler::b(Condition cond, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5);
    }
    // Compare-and-branch on zero / non-zero.
    void Assembler::cbz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
    }
    void Assembler::cbnz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
    }
2425 
    // Loads from [src + imm12 (scaled)].  The leading two bits encode the size
    // (11=8-byte, 10=4, 01=2, 00=1); overloads differ only in register class.
    void Assembler::ldrd(X dst, X src, int imm12) {
        this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrs(X dst, X src, int imm12) {
        this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrh(X dst, X src, int imm12) {
        this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrb(X dst, X src, int imm12) {
        this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }

    // SIMD/FP register loads; ldrq moves a full 16-byte vector.
    void Assembler::ldrq(V dst, X src, int imm12) {
        this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrd(V dst, X src, int imm12) {
        this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrs(V dst, X src, int imm12) {
        this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrh(V dst, X src, int imm12) {
        this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrb(V dst, X src, int imm12) {
        this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
2454 
    // Stores to [dst + imm12 (scaled)], mirroring the ldr* encodings above
    // with the load/store bit flipped.
    void Assembler::strs(X src, X dst, int imm12) {
        this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }

    void Assembler::strq(V src, X dst, int imm12) {
        this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strd(V src, X dst, int imm12) {
        this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strs(V src, X dst, int imm12) {
        this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strh(V src, X dst, int imm12) {
        this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strb(V src, X dst, int imm12) {
        this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
2474 
    // Move one 32-bit lane between a vector and a general-purpose register.
    // imm5's low bits select element size (0b100 = 32-bit); `lane` sits above them.
    void Assembler::movs(X dst, V src, int lane) {
        int imm5 = (lane << 3) | 0b100;
        this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & 5_mask) << 16);
    }
    void Assembler::inss(V dst, X src, int lane) {
        int imm5 = (lane << 3) | 0b100;
        this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & 5_mask) << 16);
    }
2483 
2484 
    // PC-relative 16-byte literal load, patched later via the label machinery.
    void Assembler::ldrq(V dst, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5);
    }

    // Broadcast a general-purpose register into all four 32-bit lanes.
    void Assembler::dup4s(V dst, X src) {
        this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst);
    }

    // Load one element from [src] and replicate it to every lane;
    // the trailing two bits select the element size (10=4B, 01=2B, 00=1B).
    void Assembler::ld1r4s(V dst, X src) {
        this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst);
    }
    void Assembler::ld1r8h(V dst, X src) {
        this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst);
    }
    void Assembler::ld1r16b(V dst, X src) {
        this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst);
    }
2503 
    // De-interleaving structure loads/stores of 2 or 4 vectors of 4 x 32-bit lanes.
    void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); }
    void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); }
    void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); }
    void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); }

    // Single-lane structure loads: `lane` splits into the Q (bit 30) and S (bit 12)
    // fields of the encoding.
    void Assembler::ld24s(V dst, X src, int lane) {
        int Q = (lane & 2)>>1,
            S = (lane & 1);
                 /*  Q                       S */
        this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12));
    }
    void Assembler::ld44s(V dst, X src, int lane) {
        int Q = (lane & 2)>>1,
            S = (lane & 1);
        this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12));
    }
2520 
    // Bind label l to the current position and back-patch every instruction that
    // referenced it.  fCode is null during the sizing pass, so patching only
    // happens on the second (real) pass.
    void Assembler::label(Label* l) {
        if (fCode) {
            // The instructions all currently point to l->offset.
            // We'll want to add a delta to point them to here.
            int here = (int)this->size();
            int delta = here - l->offset;
            l->offset = here;

            if (l->kind == Label::ARMDisp19) {
                for (int ref : l->references) {
                    // ref points to a 32-bit instruction with 19-bit displacement in instructions.
                    uint32_t inst;
                    memcpy(&inst, fCode + ref, 4);

                    // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
                    // The shift-left-then-arithmetic-shift-right sign-extends the field.
                    int disp = (int)(inst << 8) >> 13;

                    disp += delta/4;  // delta is in bytes, we want instructions.

                    // Put it all back together, preserving the high 8 bits and low 5.
                    inst = ((disp << 5) &  (19_mask << 5))
                         | ((inst     ) & ~(19_mask << 5));
                    memcpy(fCode + ref, &inst, 4);
                }
            }

            if (l->kind == Label::X86Disp32) {
                for (int ref : l->references) {
                    // ref points to a 32-bit displacement in bytes.
                    int disp;
                    memcpy(&disp, fCode + ref, 4);

                    disp += delta;

                    memcpy(fCode + ref, &disp, 4);
                }
            }
        }
    }
2560 
    // Run the program over n lanes, with one pointer in args[] per program argument.
    // Dispatches to the JIT entry point when one is ready and allowed,
    // otherwise falls back to the portable interpreter.
    void Program::eval(int n, void* args[]) const {
    #define SKVM_JIT_STATS 0
    #if SKVM_JIT_STATS
        // Opt-in instrumentation: report at exit what fraction of calls/pixels hit the JIT.
        static std::atomic<int64_t>  calls{0}, jits{0},
                                    pixels{0}, fast{0};
        pixels += n;
        if (0 == calls++) {
            atexit([]{
                int64_t num = jits .load(),
                        den = calls.load();
                SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den);
                num = fast  .load();
                den = pixels.load();
                SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den);
            });
        }
    #endif

    #if !defined(SKVM_JIT_BUT_IGNORE_IT)
        const void* jit_entry = fImpl->jit_entry.load();
        // jit_entry may be null either simply because we can't JIT, or when using LLVM
        // if the work represented by fImpl->llvm_compiling hasn't finished yet.
        //
        // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it
        // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off,
        // due to timing or program caching.
        if (jit_entry != nullptr && gSkVMAllowJIT) {
        #if SKVM_JIT_STATS
            jits++;
            fast += n;
        #endif
            // The JIT entry point's signature varies with the number of arguments,
            // so cast it to the matching function-pointer type and tail-call it.
            void** a = args;
            switch (fImpl->strides.size()) {
                case 0: return ((void(*)(int                        ))jit_entry)(n               );
                case 1: return ((void(*)(int,void*                  ))jit_entry)(n,a[0]          );
                case 2: return ((void(*)(int,void*,void*            ))jit_entry)(n,a[0],a[1]     );
                case 3: return ((void(*)(int,void*,void*,void*      ))jit_entry)(n,a[0],a[1],a[2]);
                case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3]);
                case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4]);
                case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4],a[5]);
                case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]);
                // More than 7 arguments: fall through to the interpreter below.
                default: break; //SkASSERT(fImpl->strides.size() <= 7);
            }
        }
    #endif

        // So we'll sometimes use the interpreter here even if later calls will use the JIT.
        SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
                               this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(),
                               n, args);
    }
2616 
2617     #if defined(SKVM_LLVM)
2618     // -- SKVM_LLVM --------------------------------------------------------------------------------
    // Build an LLVM IR function equivalent to this program and kick off (usually
    // asynchronous) compilation of it.  The generated function has the shape
    //     void fn(int n, void* arg0, ..., void* argN)
    // and is structured as a K-wide vector loop followed by a scalar tail loop,
    // each preceded by its own block of hoisted (loop-invariant) instructions.
    // On success the compiled entry point eventually lands in fImpl->jit_entry.
    void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions,
                            const char* debug_name) {
        auto ctx = std::make_unique<llvm::LLVMContext>();

        auto mod = std::make_unique<llvm::Module>("", *ctx);
        // All the scary bare pointers from here on are owned by ctx or mod, I think.

        // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines.
        const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4;

        llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(),
                   *i32 = llvm::Type::getInt32Ty(*ctx);

        // Signature: (i32 n, i8* arg0, ..., i8* argN) -> void.
        std::vector<llvm::Type*> arg_types = { i32 };
        for (size_t i = 0; i < fImpl->strides.size(); i++) {
            arg_types.push_back(ptr);
        }

        llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx),
                                                              arg_types, /*vararg?=*/false);
        llvm::Function* fn
            = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod);
        // Mark every pointer argument NoAlias so LLVM can vectorize/reorder freely.
        for (size_t i = 0; i < fImpl->strides.size(); i++) {
            fn->addParamAttr(i+1, llvm::Attribute::NoAlias);
        }

        // Control-flow skeleton:
        //   enter -> hoistK -> testK <-> loopK
        //                      testK -> hoist1 -> test1 <-> loop1
        //                                         test1 -> leave
        llvm::BasicBlock *enter  = llvm::BasicBlock::Create(*ctx, "enter" , fn),
                         *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn),
                         *testK  = llvm::BasicBlock::Create(*ctx, "testK" , fn),
                         *loopK  = llvm::BasicBlock::Create(*ctx, "loopK" , fn),
                         *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn),
                         *test1  = llvm::BasicBlock::Create(*ctx, "test1" , fn),
                         *loop1  = llvm::BasicBlock::Create(*ctx, "loop1" , fn),
                         *leave  = llvm::BasicBlock::Create(*ctx, "leave" , fn);

        using IRBuilder = llvm::IRBuilder<>;

        llvm::PHINode*                 n;       // Remaining lane count, updated per loop.
        std::vector<llvm::PHINode*> args;       // One pointer PHI per program argument.
        std::vector<llvm::Value*> vals(instructions.size());  // IR value for each instruction.

        // Emit instruction i into block b, K-wide when !scalar.
        // Returns false for ops this backend can't handle, aborting setup.
        auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
            auto [op, x,y,z,w, immA,immB,immC, death,can_hoist] = instructions[i];

            // Lowercase types are scalars; uppercase are the K-wide vector (or scalar) forms.
            llvm::Type *i1    = llvm::Type::getInt1Ty (*ctx),
                       *i8    = llvm::Type::getInt8Ty (*ctx),
                       *i16   = llvm::Type::getInt16Ty(*ctx),
                       *f32   = llvm::Type::getFloatTy(*ctx),
                       *I1    = scalar ? i1    : llvm::VectorType::get(i1 , K, false  ),
                       *I8    = scalar ? i8    : llvm::VectorType::get(i8 , K, false  ),
                       *I16   = scalar ? i16   : llvm::VectorType::get(i16, K, false  ),
                       *I32   = scalar ? i32   : llvm::VectorType::get(i32, K, false  ),
                       *F32   = scalar ? f32   : llvm::VectorType::get(f32, K, false  );

            // All values are carried as I32 bits; I()/F() bitcast between views.
            auto I  = [&](llvm::Value* v) { return b->CreateBitCast(v, I32  ); };
            auto F  = [&](llvm::Value* v) { return b->CreateBitCast(v, F32  ); };

            // Sign-extend an i1 comparison result to all-0s/all-1s I32 lanes.
            auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); };

            llvm::Type* vt = nullptr;
            switch (llvm::Type* t = nullptr; op) {
                default:
                    SkDebugf("can't llvm %s (%d)\n", name(op), op);
                    return false;

                case Op::assert_true: /*TODO*/ break;

                case Op::trace_line:
                case Op::trace_var:
                case Op::trace_call:
                    /* Only supported in the interpreter. */
                    break;

                case Op::index:
                    // index = n - {0,1,2,...}: counts down with the remaining lane count.
                    if (I32->isVectorTy()) {
                        std::vector<llvm::Constant*> iota(K);
                        for (int j = 0; j < K; j++) {
                            iota[j] = b->getInt32(j);
                        }
                        vals[i] = b->CreateSub(b->CreateVectorSplat(K, n),
                                               llvm::ConstantVector::get(iota));
                    } else {
                        vals[i] = n;
                    } break;

                case Op::load8:  t = I8 ; goto load;
                case Op::load16: t = I16; goto load;
                case Op::load32: t = I32; goto load;
                load: {
                    // Unaligned load at the element width, zero-extended to I32 lanes.
                    llvm::Value* ptr = b->CreateBitCast(args[immA], t->getPointerTo());
                    vals[i] = b->CreateZExt(
                            b->CreateAlignedLoad(t, ptr, llvm::MaybeAlign{1}), I32);
                } break;


                case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immA); break;

                case Op::uniform32: {
                    // Load a scalar i32 at byte offset immB into uniform arg immA,
                    // then broadcast it across lanes in vector mode.
                    llvm::Value* ptr = b->CreateBitCast(
                            b->CreateConstInBoundsGEP1_32(i8, args[immA], immB),
                            i32->getPointerTo());
                    llvm::Value* val = b->CreateZExt(
                            b->CreateAlignedLoad(i32, ptr, llvm::MaybeAlign{1}), i32);
                    vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val)
                                                : val;
                } break;

                case Op::gather8:  t = i8 ; vt = I8; goto gather;
                case Op::gather16: t = i16; vt = I16; goto gather;
                case Op::gather32: t = i32; vt = I32; goto gather;
                gather: {
                    // Our gather base pointer is immB bytes off of uniform immA.
                    llvm::Value* base =
                        b->CreateLoad(b->CreateBitCast(
                                b->CreateConstInBoundsGEP1_32(i8, args[immA],immB),
                                t->getPointerTo()->getPointerTo()));

                    // Indexed by vals[x]; a vector of pointers becomes a masked gather,
                    // a single pointer a plain load.
                    llvm::Value* ptr = b->CreateInBoundsGEP(t, base, vals[x]);
                    llvm::Value* gathered;
                    if (ptr->getType()->isVectorTy()) {
                        gathered = b->CreateMaskedGather(
                                vt,
                                ptr,
                                llvm::Align{1});
                    } else {
                        gathered = b->CreateAlignedLoad(vt, ptr, llvm::MaybeAlign{1});
                    }
                    vals[i] = b->CreateZExt(gathered, I32);
                } break;

                case Op::store8:  t = I8 ; goto store;
                case Op::store16: t = I16; goto store;
                case Op::store32: t = I32; goto store;
                store: {
                    // Truncate I32 lanes down to the element width before storing.
                    llvm::Value* val = b->CreateTrunc(vals[x], t);
                    llvm::Value* ptr = b->CreateBitCast(args[immA],
                                                        val->getType()->getPointerTo());
                    vals[i] = b->CreateAlignedStore(val, ptr, llvm::MaybeAlign{1});
                } break;

                case Op::bit_and:   vals[i] = b->CreateAnd(vals[x], vals[y]); break;
                case Op::bit_or :   vals[i] = b->CreateOr (vals[x], vals[y]); break;
                case Op::bit_xor:   vals[i] = b->CreateXor(vals[x], vals[y]); break;
                case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;

                case Op::select:
                    // Condition lanes are all-0s/all-1s I32; trunc to i1 for select.
                    vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
                    break;

                case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break;
                case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break;
                case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break;

                case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immA); break;
                case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immA); break;
                case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immA); break;

                case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break;
                case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break;

                case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break;
                case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break;
                case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break;
                case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break;

                case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break;
                case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break;
                case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break;
                case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break;

                case Op::fma_f32:
                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
                                                   {F(vals[x]), F(vals[y]), F(vals[z])}));
                    break;

                case Op::fms_f32:
                    // fms(x,y,z) = x*y - z, expressed as fma(x, y, -z).
                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
                                                   {F(vals[x]), F(vals[y]),
                                                    b->CreateFNeg(F(vals[z]))}));
                    break;

                case Op::fnma_f32:
                    // fnma(x,y,z) = z - x*y, expressed as fma(-x, y, z).
                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
                                                   {b->CreateFNeg(F(vals[x])), F(vals[y]),
                                                    F(vals[z])}));
                    break;

                case Op::ceil:
                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x])));
                    break;
                case Op::floor:
                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x])));
                    break;

                case Op::max_f32:
                    vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])),
                                                F(vals[y]), F(vals[x])));
                    break;
                case Op::min_f32:
                    vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])),
                                                F(vals[y]), F(vals[x])));
                    break;

                case Op::sqrt_f32:
                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x])));
                    break;

                case Op::to_f32: vals[i] = I(b->CreateSIToFP(  vals[x] , F32)); break;
                case Op::trunc : vals[i] =   b->CreateFPToSI(F(vals[x]), I32) ; break;
                case Op::round : {
                    // Basic impl when we can't use cvtps2dq and co.
                    auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x]));
                    vals[i] = b->CreateFPToSI(round, I32);

                #if 1 && defined(SK_CPU_X86)
                    // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling.
                    if (scalar) {
                        // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3.  ¯\_(ツ)_/¯
                        llvm::Value* v = llvm::UndefValue::get(
                                llvm::VectorType::get(f32, 4, false));
                        v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0);
                        vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v});
                    } else {
                        SkASSERT(K == 4  || K == 8);
                        auto intr = K == 4 ?   llvm::Intrinsic::x86_sse2_cvtps2dq :
                                 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256;
                        vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])});
                    }
                #endif
                } break;

            }
            return true;
        };

        // enter: unconditionally jump into the hoisted-vector block.
        {
            IRBuilder b(enter);
            b.CreateBr(hoistK);
        }

        // hoistK: emit each hoistable vector instruction; goto testK;
        // LLVM can do this sort of thing itself, but we've got the information cheap,
        // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe.
        {
            IRBuilder b(hoistK);

            // Hoisted instructions will need args (think, uniforms), so set that up now.
            // These phi nodes are degenerate... they'll always be the passed-in args from enter.
            // Later on when we start looping the phi nodes will start looking useful.
            llvm::Argument* arg = fn->arg_begin();
            (void)arg++;  // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction.
            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                args.push_back(b.CreatePHI(arg->getType(), 1));
                args.back()->addIncoming(arg++, enter);
            }

            for (size_t i = 0; i < instructions.size(); i++) {
                if (instructions[i].can_hoist && !emit(i, false, &b)) {
                    // Unsupported op: leave jit_entry unset and fall back to the interpreter.
                    return;
                }
            }

            b.CreateBr(testK);
        }

        // testK:  if (N >= K) goto loopK; else goto hoist1;
        {
            IRBuilder b(testK);

            // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
            // These also start as the initial function arguments; hoistK can't have changed them.
            llvm::Argument* arg = fn->arg_begin();

            n = b.CreatePHI(arg->getType(), 2);
            n->addIncoming(arg++, hoistK);

            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                args[i] = b.CreatePHI(arg->getType(), 2);
                args[i]->addIncoming(arg++, hoistK);
            }

            b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
        }

        // loopK:  ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
        {
            IRBuilder b(loopK);
            for (size_t i = 0; i < instructions.size(); i++) {
                if (!instructions[i].can_hoist && !emit(i, false, &b)) {
                    return;
                }
            }

            // n -= K
            llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
            n->addIncoming(n_next, loopK);

            // Each arg ptr += K
            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                llvm::Value* arg_next
                    = b.CreateConstInBoundsGEP1_32(
                            llvm::Type::getInt8Ty (*ctx),
                            args[i],
                            K*fImpl->strides[i]);
                args[i]->addIncoming(arg_next, loopK);
            }
            b.CreateBr(testK);
        }

        // hoist1: emit each hoistable scalar instruction; goto test1;
        {
            IRBuilder b(hoist1);
            for (size_t i = 0; i < instructions.size(); i++) {
                if (instructions[i].can_hoist && !emit(i, true, &b)) {
                    return;
                }
            }
            b.CreateBr(test1);
        }

        // test1:  if (N >= 1) goto loop1; else goto leave;
        {
            IRBuilder b(test1);

            // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
            llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
            n_new->addIncoming(n, hoist1);
            n = n_new;

            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
                arg_new->addIncoming(args[i], hoist1);
                args[i] = arg_new;
            }

            b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave);
        }

        // loop1:  ... insts on scalars; N -= 1, args += stride; goto test1;
        {
            IRBuilder b(loop1);
            for (size_t i = 0; i < instructions.size(); i++) {
                if (!instructions[i].can_hoist && !emit(i, true, &b)) {
                    return;
                }
            }

            // n -= 1
            llvm::Value* n_next = b.CreateSub(n, b.getInt32(1));
            n->addIncoming(n_next, loop1);

            // Each arg ptr += 1
            for (size_t i = 0; i < fImpl->strides.size(); i++) {
                llvm::Value* arg_next
                    = b.CreateConstInBoundsGEP1_32(
                            llvm::Type::getInt8Ty (*ctx), args[i], fImpl->strides[i]);
                args[i]->addIncoming(arg_next, loop1);
            }
            b.CreateBr(test1);
        }

        // leave:  ret
        {
            IRBuilder b(leave);
            b.CreateRetVoid();
        }

        // verifyModule() returns true on error; SkASSERT compiles away in release builds.
        SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));

        // NOTE(review): this dump is unconditional (`if (true)`), writes bitcode to /tmp,
        // and abandons JIT setup entirely if the file can't be opened — looks like a
        // debugging aid left enabled; confirm whether it should be `if (false)`.
        if (true) {
            SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
            std::error_code err;
            llvm::raw_fd_ostream os(path.c_str(), err);
            if (err) {
                return;
            }
            llvm::WriteBitcodeToFile(*mod, os);
        }

        // One-time global LLVM target initialization; these return false on success.
        static SkOnce once;
        once([]{
            SkAssertResult(false == llvm::InitializeNativeTarget());
            SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
        });

        if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
                                            .setEngineKind(llvm::EngineKind::JIT)
                                            .setMCPU(llvm::sys::getHostCPUName())
                                            .create()) {
            fImpl->llvm_ctx = std::move(ctx);
            fImpl->llvm_ee.reset(ee);

            #if defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
            // Wait for llvm to compile
            void* function = (void*)ee->getFunctionAddress(debug_name);
            fImpl->jit_entry.store(function);
            #else
            // We have to be careful here about what we close over and how, in case fImpl moves.
            // fImpl itself may change, but its pointee fields won't, so close over them by value.
            // Also, debug_name will almost certainly leave scope, so copy it.
            fImpl->llvm_compiling = std::async(std::launch::async, [dst  = &fImpl->jit_entry,
                                                                    ee   =  fImpl->llvm_ee.get(),
                                                                    name = std::string(debug_name)]{
                // std::atomic<void*>*    dst;
                // llvm::ExecutionEngine* ee;
                // std::string            name;
                dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
            });
            #endif
        }
    }
3030     #endif  // SKVM_LLVM
3031 
waitForLLVM() const3032     void Program::waitForLLVM() const {
3033     #if defined(SKVM_LLVM) && !defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
3034         if (fImpl->llvm_compiling.valid()) {
3035             fImpl->llvm_compiling.wait();
3036         }
3037     #endif
3038     }
3039 
hasJIT() const3040     bool Program::hasJIT() const {
3041         // Program::hasJIT() is really just a debugging / test aid,
3042         // so we don't mind adding a sync point here to wait for compilation.
3043         this->waitForLLVM();
3044 
3045         return fImpl->jit_entry.load() != nullptr;
3046     }
3047 
    // Release any JIT-compiled code and reset the JIT bookkeeping fields,
    // leaving the program to run via the interpreter.
    void Program::dropJIT() {
    #if defined(SKVM_LLVM)
        // Must not tear down the ExecutionEngine while the async compile still uses it.
        this->waitForLLVM();
        fImpl->llvm_ee .reset(nullptr);
        fImpl->llvm_ctx.reset(nullptr);
    #elif defined(SKVM_JIT)
        // Code either lives in a dylib or in a buffer we mapped ourselves — never both.
        if (fImpl->dylib) {
            close_dylib(fImpl->dylib);
        } else if (auto jit_entry = fImpl->jit_entry.load()) {
            unmap_jit_buffer(jit_entry, fImpl->jit_size);
        }
    #else
        SkASSERT(!this->hasJIT());
    #endif

        fImpl->jit_entry.store(nullptr);
        fImpl->jit_size  = 0;
        fImpl->dylib     = nullptr;
    }
3067 
    // An empty Program; the real construction path is the instruction-list constructor below.
    Program::Program() : fImpl(std::make_unique<Impl>()) {}

    Program::~Program() {
        // Moved-from Programs may have fImpl == nullptr.
        if (fImpl) {
            this->dropJIT();
        }
    }

    Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}

    Program& Program::operator=(Program&& other) {
        // NOTE(review): the previous fImpl is destroyed here without an explicit
        // dropJIT(); presumably Impl's destructor releases any JIT state — confirm.
        fImpl = std::move(other.fImpl);
        return *this;
    }
3083 
    // Build a runnable Program: optionally kick off JIT/LLVM compilation,
    // and always prepare the interpreter fallback.
    Program::Program(const std::vector<OptimizedInstruction>& instructions,
                     const std::vector<int>& strides,
                     const char* debug_name, bool allow_jit) : Program() {
        fImpl->strides = strides;
        // JIT only when both the global switch and this caller allow it.
        if (gSkVMAllowJIT && allow_jit) {
        #if 1 && defined(SKVM_LLVM)
            this->setupLLVM(instructions, debug_name);
        #elif 1 && defined(SKVM_JIT)
            this->setupJIT(instructions, debug_name);
        #endif
        }

        // Might as well do this after setupLLVM() to get a little more time to compile.
        this->setupInterpreter(instructions);
    }
3099 
    // Simple accessors over the interpreter program built by setupInterpreter():
    // the translated instructions, argument count, register count, index of the
    // first in-loop instruction, and whether there's anything to run at all.
    std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
    int  Program::nargs() const { return (int)fImpl->strides.size(); }
    int  Program::nregs() const { return fImpl->regs; }
    int  Program::loop () const { return fImpl->loop; }
    bool Program::empty() const { return fImpl->instructions.empty(); }
3105 
3106     // Translate OptimizedInstructions to InterpreterInstructions.
setupInterpreter(const std::vector<OptimizedInstruction> & instructions)3107     void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
3108         // Register each instruction is assigned to.
3109         std::vector<Reg> reg(instructions.size());
3110 
3111         // This next bit is a bit more complicated than strictly necessary;
3112         // we could just assign every instruction to its own register.
3113         //
3114         // But recycling registers is fairly cheap, and good practice for the
3115         // JITs where minimizing register pressure really is important.
3116         //
3117         // We have effectively infinite registers, so we hoist any value we can.
3118         // (The JIT may choose a more complex policy to reduce register pressure.)
3119 
3120         fImpl->regs = 0;
3121         std::vector<Reg> avail;
3122 
3123         // Assign this value to a register, recycling them where we can.
3124         auto assign_register = [&](Val id) {
3125             const OptimizedInstruction& inst = instructions[id];
3126 
3127             // If this is a real input and it's lifetime ends at this instruction,
3128             // we can recycle the register it's occupying.
3129             auto maybe_recycle_register = [&](Val input) {
3130                 if (input != NA && instructions[input].death == id) {
3131                     avail.push_back(reg[input]);
3132                 }
3133             };
3134 
3135             // Take care to not recycle the same register twice.
3136             const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w;
3137             if (true                      ) { maybe_recycle_register(x); }
3138             if (y != x                    ) { maybe_recycle_register(y); }
3139             if (z != x && z != y          ) { maybe_recycle_register(z); }
3140             if (w != x && w != y && w != z) { maybe_recycle_register(w); }
3141 
3142             // Instructions that die at themselves (stores) don't need a register.
3143             if (inst.death != id) {
3144                 // Allocate a register if we have to, preferring to reuse anything available.
3145                 if (avail.empty()) {
3146                     reg[id] = fImpl->regs++;
3147                 } else {
3148                     reg[id] = avail.back();
3149                     avail.pop_back();
3150                 }
3151             }
3152         };
3153 
3154         // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
3155         for (Val id = 0; id < (Val)instructions.size(); id++) {
3156             if ( instructions[id].can_hoist) { assign_register(id); }
3157         }
3158         for (Val id = 0; id < (Val)instructions.size(); id++) {
3159             if (!instructions[id].can_hoist) { assign_register(id); }
3160         }
3161 
3162         // Translate OptimizedInstructions to InterpreterIstructions by mapping values to
3163         // registers.  This will be two passes, first hoisted instructions, then inside the loop.
3164 
3165         // The loop begins at the fImpl->loop'th Instruction.
3166         fImpl->loop = 0;
3167         fImpl->instructions.reserve(instructions.size());
3168 
3169         // Add a mapping for the N/A sentinel Val to any arbitrary register
3170         // so lookups don't have to know which arguments are used by which Ops.
3171         auto lookup_register = [&](Val id) {
3172             return id == NA ? (Reg)0
3173                             : reg[id];
3174         };
3175 
3176         auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
3177             InterpreterInstruction pinst{
3178                 inst.op,
3179                 lookup_register(id),
3180                 lookup_register(inst.x),
3181                 lookup_register(inst.y),
3182                 lookup_register(inst.z),
3183                 lookup_register(inst.w),
3184                 inst.immA,
3185                 inst.immB,
3186                 inst.immC,
3187             };
3188             fImpl->instructions.push_back(pinst);
3189         };
3190 
3191         for (Val id = 0; id < (Val)instructions.size(); id++) {
3192             const OptimizedInstruction& inst = instructions[id];
3193             if (inst.can_hoist) {
3194                 push_instruction(id, inst);
3195                 fImpl->loop++;
3196             }
3197         }
3198         for (Val id = 0; id < (Val)instructions.size(); id++) {
3199             const OptimizedInstruction& inst = instructions[id];
3200             if (!inst.can_hoist) {
3201                 push_instruction(id, inst);
3202             }
3203         }
3204     }
3205 
3206 #if defined(SKVM_JIT)
3207 
3208     namespace SkVMJitTypes {
    // Reg names the widest SIMD register class the JIT targets on the current
    // architecture.  It matches the lane count K chosen in Program::jit():
    //   - x86-64:  Ymm, a 256-bit AVX register (K = 8 lanes of 32 bits)
    //   - aarch64: V,   a 128-bit NEON register (K = 4 lanes of 32 bits)
    // On any other architecture Reg is left undefined; the JIT code that uses
    // it is compiled out by the matching #if guards below.
3209     #if defined(__x86_64__) || defined(_M_X64)
3210         using Reg = Assembler::Ymm;
3211     #elif defined(__aarch64__)
3212         using Reg = Assembler::V;
3213     #endif
3214     }  // namespace SkVMJitTypes
3215 
jit(const std::vector<OptimizedInstruction> & instructions,int * stack_hint,uint32_t * registers_used,Assembler * a) const3216     bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
3217                       int* stack_hint,
3218                       uint32_t* registers_used,
3219                       Assembler* a) const {
3220         using A = Assembler;
3221         using SkVMJitTypes::Reg;
3222 
3223         SkTHashMap<int, A::Label> constants;    // Constants (mostly splats) share the same pool.
3224         A::Label                  iota;         // Varies per lane, for Op::index.
3225         A::Label                  load64_index; // Used to load low or high half of 64-bit lanes.
3226 
3227         // The `regs` array tracks everything we know about each register's state:
3228         //   - NA:   empty
3229         //   - RES:  reserved by ABI
3230         //   - TMP:  holding a temporary
3231         //   - id:   holding Val id
3232         constexpr Val RES = NA-1,
3233                       TMP = RES-1;
3234 
3235         // Map val -> stack slot.
3236         std::vector<int> stack_slot(instructions.size(), NA);
3237         int next_stack_slot = 0;
3238 
3239         const int nstack_slots = *stack_hint >= 0 ? *stack_hint
3240                                                   : stack_slot.size();
3241     #if defined(__x86_64__) || defined(_M_X64)
3242         if (!SkCpu::Supports(SkCpu::HSW)) {
3243             return false;
3244         }
3245         const int K = 8;
3246         #if defined(_M_X64)  // Important to check this first; clang-cl defines both.
3247             const A::GP64 N = A::rcx,
3248                         GP0 = A::rax,
3249                         GP1 = A::r11,
3250                         arg[]    = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi };
3251 
3252             // xmm6-15 are callee-saved.
3253             std::array<Val,16> regs = {
3254                  NA, NA, NA, NA,  NA, NA,RES,RES,
3255                 RES,RES,RES,RES, RES,RES,RES,RES,
3256             };
3257             const uint32_t incoming_registers_used = *registers_used;
3258 
3259             auto enter = [&]{
3260                 // rcx,rdx,r8,r9 are all already holding their correct values.
3261                 // Load caller-saved r10 from rsp+40 if there's a fourth arg.
3262                 if (fImpl->strides.size() >= 4) {
3263                     a->mov(A::r10, A::Mem{A::rsp, 40});
3264                 }
3265                 // Load callee-saved rdi from rsp+48 if there's a fifth arg,
3266                 // first saving it to ABI reserved shadow area rsp+8.
3267                 if (fImpl->strides.size() >= 5) {
3268                     a->mov(A::Mem{A::rsp, 8}, A::rdi);
3269                     a->mov(A::rdi, A::Mem{A::rsp, 48});
3270                 }
3271                 // Load callee-saved rsi from rsp+56 if there's a sixth arg,
3272                 // first saving it to ABI reserved shadow area rsp+16.
3273                 if (fImpl->strides.size() >= 6) {
3274                     a->mov(A::Mem{A::rsp, 16}, A::rsi);
3275                     a->mov(A::rsi, A::Mem{A::rsp, 56});
3276                 }
3277 
3278                 // Allocate stack for our values and callee-saved xmm6-15.
3279                 int stack_needed = nstack_slots*K*4;
3280                 for (int r = 6; r < 16; r++) {
3281                     if (incoming_registers_used & (1<<r)) {
3282                         stack_needed += 16;
3283                     }
3284                 }
3285                 if (stack_needed) { a->sub(A::rsp, stack_needed); }
3286 
3287                 int next_saved_xmm = nstack_slots*K*4;
3288                 for (int r = 6; r < 16; r++) {
3289                     if (incoming_registers_used & (1<<r)) {
3290                         a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r);
3291                         next_saved_xmm += 16;
3292                         regs[r] = NA;
3293                     }
3294                 }
3295             };
3296             auto exit  = [&]{
3297                 // The second pass of jit() shouldn't use any register it didn't in the first pass.
3298                 SkASSERT((*registers_used & incoming_registers_used) == *registers_used);
3299 
3300                 // Restore callee-saved xmm6-15 and the stack pointer.
3301                 int stack_used = nstack_slots*K*4;
3302                 for (int r = 6; r < 16; r++) {
3303                     if (incoming_registers_used & (1<<r)) {
3304                         a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used});
3305                         stack_used += 16;
3306                     }
3307                 }
3308                 if (stack_used) { a->add(A::rsp, stack_used); }
3309 
3310                 // Restore callee-saved rdi/rsi if we used them.
3311                 if (fImpl->strides.size() >= 5) {
3312                     a->mov(A::rdi, A::Mem{A::rsp, 8});
3313                 }
3314                 if (fImpl->strides.size() >= 6) {
3315                     a->mov(A::rsi, A::Mem{A::rsp, 16});
3316                 }
3317 
3318                 a->vzeroupper();
3319                 a->ret();
3320             };
3321         #elif defined(__x86_64__)
3322             const A::GP64 N = A::rdi,
3323                         GP0 = A::rax,
3324                         GP1 = A::r11,
3325                         arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 };
3326 
3327             // All 16 ymm registers are available to use.
3328             std::array<Val,16> regs = {
3329                 NA,NA,NA,NA, NA,NA,NA,NA,
3330                 NA,NA,NA,NA, NA,NA,NA,NA,
3331             };
3332 
3333             auto enter = [&]{
3334                 // Load caller-saved r10 from rsp+8 if there's a sixth arg.
3335                 if (fImpl->strides.size() >= 6) {
3336                     a->mov(A::r10, A::Mem{A::rsp, 8});
3337                 }
3338                 if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }
3339             };
3340             auto exit  = [&]{
3341                 if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
3342                 a->vzeroupper();
3343                 a->ret();
3344             };
3345         #endif
3346 
3347         auto load_from_memory = [&](Reg r, Val v) {
3348             if (instructions[v].op == Op::splat) {
3349                 if (instructions[v].immA == 0) {
3350                     a->vpxor(r,r,r);
3351                 } else {
3352                     a->vmovups(r, constants.find(instructions[v].immA));
3353                 }
3354             } else {
3355                 SkASSERT(stack_slot[v] != NA);
3356                 a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4});
3357             }
3358         };
3359         auto store_to_stack = [&](Reg r, Val v) {
3360             SkASSERT(next_stack_slot < nstack_slots);
3361             stack_slot[v] = next_stack_slot++;
3362             a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r);
3363         };
3364     #elif defined(__aarch64__)
3365         const int K = 4;
3366         const A::X N     = A::x0,
3367                    GP0   = A::x8,
3368                    GP1   = A::x9,
3369                    arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
3370 
3371         // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
3372         std::array<Val,32> regs = {
3373              NA, NA, NA, NA,  NA, NA, NA, NA,
3374             RES,RES,RES,RES, RES,RES,RES,RES,
3375              NA, NA, NA, NA,  NA, NA, NA, NA,
3376              NA, NA, NA, NA,  NA, NA, NA, NA,
3377         };
3378 
3379         auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
3380         auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
3381                           a->ret(A::x30); };
3382 
3383         auto load_from_memory = [&](Reg r, Val v) {
3384             if (instructions[v].op == Op::splat) {
3385                 if (instructions[v].immA == 0) {
3386                     a->eor16b(r,r,r);
3387                 } else {
3388                     a->ldrq(r, constants.find(instructions[v].immA));
3389                 }
3390             } else {
3391                 SkASSERT(stack_slot[v] != NA);
3392                 a->ldrq(r, A::sp, stack_slot[v]);
3393             }
3394         };
3395         auto store_to_stack  = [&](Reg r, Val v) {
3396             SkASSERT(next_stack_slot < nstack_slots);
3397             stack_slot[v] = next_stack_slot++;
3398             a->strq(r, A::sp, stack_slot[v]);
3399         };
3400     #endif
3401 
3402         *registers_used = 0;  // We'll update this as we go.
3403 
3404         if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) {
3405             return false;
3406         }
3407 
3408         auto emit = [&](Val id, bool scalar) {
3409             const int active_lanes = scalar ? 1 : K;
3410             const OptimizedInstruction& inst = instructions[id];
3411             const Op op = inst.op;
3412             const Val x = inst.x,
3413                       y = inst.y,
3414                       z = inst.z,
3415                       w = inst.w;
3416             const int immA = inst.immA,
3417                       immB = inst.immB,
3418                       immC = inst.immC;
3419 
3420             // alloc_tmp() returns the first of N adjacent temporary registers,
3421             // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst().
3422             auto alloc_tmp = [&](int N=1) -> Reg {
3423                 auto needs_spill = [&](Val v) -> bool {
3424                     SkASSERT(v >= 0);   // {NA,TMP,RES} need to be handled before calling this.
3425                     return stack_slot[v] == NA               // We haven't spilled it already?
3426                         && instructions[v].op != Op::splat;  // No need to spill constants.
3427                 };
3428 
3429                 // We want to find a block of N adjacent registers requiring the fewest spills.
3430                 int best_block = -1,
3431                     min_spills = 0x7fff'ffff;
3432                 for (int block = 0; block+N <= (int)regs.size(); block++) {
3433                     int spills = 0;
3434                     for (int r = block; r < block+N; r++) {
3435                         Val v = regs[r];
3436                         // Registers holding NA (nothing) are ideal, nothing to spill.
3437                         if (v == NA) {
3438                             continue;
3439                         }
3440                         // We can't spill anything REServed or that we'll need this instruction.
3441                         if (v == RES ||
3442                             v == TMP || v == id || v == x || v == y || v == z || v == w) {
3443                             spills = 0x7fff'ffff;
3444                             block  = r;   // (optimization) continue outer loop at next register.
3445                             break;
3446                         }
3447                         // Usually here we've got a value v that we'd have to spill to the stack
3448                         // before reusing its register, but sometimes even now we get a freebie.
3449                         spills += needs_spill(v) ? 1 : 0;
3450                     }
3451 
3452                     // TODO: non-arbitrary tie-breaking?
3453                     if (min_spills > spills) {
3454                         min_spills = spills;
3455                         best_block = block;
3456                     }
3457                     if (min_spills == 0) {
3458                         break;  // (optimization) stop early if we find an unbeatable block.
3459                     }
3460                 }
3461 
3462                 // TODO: our search's success isn't obviously guaranteed... it depends on N
3463                 // and the number and relative position in regs of any unspillable values.
3464                 // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64;
3465                 // we'll need to revisit this logic should this assert fire.
3466                 SkASSERT(min_spills <= N);
3467 
3468                 // Spill what needs spilling, and mark the block all as TMP.
3469                 for (int r = best_block; r < best_block+N; r++) {
3470                     Val& v = regs[r];
3471                     *registers_used |= (1<<r);
3472 
3473                     SkASSERT(v == NA || v >= 0);
3474                     if (v >= 0 && needs_spill(v)) {
3475                         store_to_stack((Reg)r, v);
3476                         SkASSERT(!needs_spill(v));
3477                         min_spills--;
3478                     }
3479 
3480                     v = TMP;
3481                 }
3482                 SkASSERT(min_spills == 0);
3483                 return (Reg)best_block;
3484             };
3485 
3486             auto free_tmp = [&](Reg r) {
3487                 SkASSERT(regs[r] == TMP);
3488                 regs[r] = NA;
3489             };
3490 
3491             // Which register holds dst,x,y,z,w for this instruction?  NA if none does yet.
3492             int rd = NA,
3493                 rx = NA,
3494                 ry = NA,
3495                 rz = NA,
3496                 rw = NA;
3497 
3498             auto update_regs = [&](Reg r, Val v) {
3499                 if (v == id) { rd = r; }
3500                 if (v ==  x) { rx = r; }
3501                 if (v ==  y) { ry = r; }
3502                 if (v ==  z) { rz = r; }
3503                 if (v ==  w) { rw = r; }
3504                 return r;
3505             };
3506 
3507             auto find_existing_reg = [&](Val v) -> int {
3508                 // Quick-check our working registers.
3509                 if (v == id && rd != NA) { return rd; }
3510                 if (v ==  x && rx != NA) { return rx; }
3511                 if (v ==  y && ry != NA) { return ry; }
3512                 if (v ==  z && rz != NA) { return rz; }
3513                 if (v ==  w && rw != NA) { return rw; }
3514 
3515                 // Search inter-instruction register map.
3516                 for (auto [r,val] : SkMakeEnumerate(regs)) {
3517                     if (val == v) {
3518                         return update_regs((Reg)r, v);
3519                     }
3520                 }
3521                 return NA;
3522             };
3523 
3524             // Return a register for Val, holding that value if it already exists.
3525             // During this instruction all calls to r(v) will return the same register.
3526             auto r = [&](Val v) -> Reg {
3527                 SkASSERT(v >= 0);
3528 
3529                 if (int found = find_existing_reg(v); found != NA) {
3530                     return (Reg)found;
3531                 }
3532 
3533                 Reg r = alloc_tmp();
3534                 SkASSERT(regs[r] == TMP);
3535 
3536                 SkASSERT(v <= id);
3537                 if (v < id) {
3538                     // If v < id, we're loading one of this instruction's inputs.
3539                     // If v == id we're just allocating its destination register.
3540                     load_from_memory(r, v);
3541                 }
3542                 regs[r] = v;
3543                 return update_regs(r, v);
3544             };
3545 
3546             auto dies_here = [&](Val v) -> bool {
3547                 SkASSERT(v >= 0);
3548                 return instructions[v].death == id;
3549             };
3550 
3551             // Alias dst() to r(v) if dies_here(v).
3552             auto try_alias = [&](Val v) -> bool {
3553                 SkASSERT(v == x || v == y || v == z || v == w);
3554                 if (dies_here(v)) {
3555                     rd = r(v);      // Vals v and id share a register for this instruction.
3556                     regs[rd] = id;  // Next instruction, Val id will be in the register, not Val v.
3557                     return true;
3558                 }
3559                 return false;
3560             };
3561 
3562             // Generally r(id),
3563             // but with a hint, try to alias dst() to r(v) if dies_here(v).
3564             auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg {
3565                 if (hint1 != NA && try_alias(hint1)) { return r(id); }
3566                 if (hint2 != NA && try_alias(hint2)) { return r(id); }
3567                 return r(id);
3568             };
3569 
3570         #if defined(__aarch64__)  // Nothing sneaky, just unused on x86-64.
3571             auto mark_tmp_as_dst = [&](Reg tmp) {
3572                 SkASSERT(regs[tmp] == TMP);
3573                 rd = tmp;
3574                 regs[rd] = id;
3575                 SkASSERT(dst() == tmp);
3576             };
3577         #endif
3578 
3579         #if defined(__x86_64__) || defined(_M_X64)
3580             // On x86 we can work with many values directly from the stack or program constant pool.
3581             auto any = [&](Val v) -> A::Operand {
3582                 SkASSERT(v >= 0);
3583                 SkASSERT(v < id);
3584 
3585                 if (int found = find_existing_reg(v); found != NA) {
3586                     return (Reg)found;
3587                 }
3588                 if (instructions[v].op == Op::splat) {
3589                     return constants.find(instructions[v].immA);
3590                 }
3591                 return A::Mem{A::rsp, stack_slot[v]*K*4};
3592             };
3593 
3594             // This is never really worth asking except when any() might be used;
3595             // if we need this value in ARM, might as well just call r(v) to get it into a register.
3596             auto in_reg = [&](Val v) -> bool {
3597                 return find_existing_reg(v) != NA;
3598             };
3599         #endif
3600 
3601             switch (op) {
3602                 // Make sure splat constants can be found by load_from_memory() or any().
3603                 case Op::splat:
3604                     (void)constants[immA];
3605                     break;
3606 
3607             #if defined(__x86_64__) || defined(_M_X64)
3608                 case Op::assert_true: {
3609                     a->vptest (r(x), &constants[0xffffffff]);
3610                     A::Label all_true;
3611                     a->jc(&all_true);
3612                     a->int3();
3613                     a->label(&all_true);
3614                 } break;
3615 
3616                 case Op::trace_line:
3617                 case Op::trace_var:
3618                 case Op::trace_call:
3619                     /* Only supported in the interpreter. */
3620                     break;
3621 
3622                 case Op::store8:
3623                     if (scalar) {
3624                         a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3625                     } else {
3626                         a->vpackusdw(dst(x), r(x), r(x));
3627                         a->vpermq   (dst(), dst(), 0xd8);
3628                         a->vpackuswb(dst(), dst(), dst());
3629                         a->vmovq    (A::Mem{arg[immA]}, (A::Xmm)dst());
3630                     } break;
3631 
3632                 case Op::store16:
3633                     if (scalar) {
3634                         a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3635                     } else {
3636                         a->vpackusdw(dst(x), r(x), r(x));
3637                         a->vpermq   (dst(), dst(), 0xd8);
3638                         a->vmovups  (A::Mem{arg[immA]}, (A::Xmm)dst());
3639                     } break;
3640 
3641                 case Op::store32: if (scalar) { a->vmovd  (A::Mem{arg[immA]}, (A::Xmm)r(x)); }
3642                                   else        { a->vmovups(A::Mem{arg[immA]},         r(x)); }
3643                                   break;
3644 
3645                 case Op::store64: if (scalar) {
3646                                       a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x));
3647                                       a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y));
3648                                   } else {
3649                                       // r(x) = {a,b,c,d|e,f,g,h}
3650                                       // r(y) = {i,j,k,l|m,n,o,p}
3651                                       // We want to write a,i,b,j,c,k,d,l,e,m...
3652                                       A::Ymm L = alloc_tmp(),
3653                                              H = alloc_tmp();
3654                                       a->vpunpckldq(L, r(x), any(y));  // L = {a,i,b,j|e,m,f,n}
3655                                       a->vpunpckhdq(H, r(x), any(y));  // H = {c,k,d,l|g,o,h,p}
3656                                       a->vperm2f128(dst(), L,H, 0x20); //   = {a,i,b,j|c,k,d,l}
3657                                       a->vmovups(A::Mem{arg[immA], 0}, dst());
3658                                       a->vperm2f128(dst(), L,H, 0x31); //   = {e,m,f,n|g,o,h,p}
3659                                       a->vmovups(A::Mem{arg[immA],32}, dst());
3660                                       free_tmp(L);
3661                                       free_tmp(H);
3662                                   } break;
3663 
3664                 case Op::store128: {
3665                     // TODO: >32-bit stores
3666                     a->vmovd  (A::Mem{arg[immA], 0*16 +  0}, (A::Xmm)r(x)   );
3667                     a->vmovd  (A::Mem{arg[immA], 0*16 +  4}, (A::Xmm)r(y)   );
3668                     a->vmovd  (A::Mem{arg[immA], 0*16 +  8}, (A::Xmm)r(z)   );
3669                     a->vmovd  (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w)   );
3670                     if (scalar) { break; }
3671 
3672                     a->vpextrd(A::Mem{arg[immA], 1*16 +  0}, (A::Xmm)r(x), 1);
3673                     a->vpextrd(A::Mem{arg[immA], 1*16 +  4}, (A::Xmm)r(y), 1);
3674                     a->vpextrd(A::Mem{arg[immA], 1*16 +  8}, (A::Xmm)r(z), 1);
3675                     a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1);
3676 
3677                     a->vpextrd(A::Mem{arg[immA], 2*16 +  0}, (A::Xmm)r(x), 2);
3678                     a->vpextrd(A::Mem{arg[immA], 2*16 +  4}, (A::Xmm)r(y), 2);
3679                     a->vpextrd(A::Mem{arg[immA], 2*16 +  8}, (A::Xmm)r(z), 2);
3680                     a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2);
3681 
3682                     a->vpextrd(A::Mem{arg[immA], 3*16 +  0}, (A::Xmm)r(x), 3);
3683                     a->vpextrd(A::Mem{arg[immA], 3*16 +  4}, (A::Xmm)r(y), 3);
3684                     a->vpextrd(A::Mem{arg[immA], 3*16 +  8}, (A::Xmm)r(z), 3);
3685                     a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3);
3686                     // Now we need to store the upper 128 bits of x,y,z,w.
3687                     // Storing in this order rather than interlacing minimizes temporaries.
3688                     a->vextracti128(dst(), r(x), 1);
3689                     a->vmovd  (A::Mem{arg[immA], 4*16 +  0}, (A::Xmm)dst()   );
3690                     a->vpextrd(A::Mem{arg[immA], 5*16 +  0}, (A::Xmm)dst(), 1);
3691                     a->vpextrd(A::Mem{arg[immA], 6*16 +  0}, (A::Xmm)dst(), 2);
3692                     a->vpextrd(A::Mem{arg[immA], 7*16 +  0}, (A::Xmm)dst(), 3);
3693 
3694                     a->vextracti128(dst(), r(y), 1);
3695                     a->vmovd  (A::Mem{arg[immA], 4*16 +  4}, (A::Xmm)dst()   );
3696                     a->vpextrd(A::Mem{arg[immA], 5*16 +  4}, (A::Xmm)dst(), 1);
3697                     a->vpextrd(A::Mem{arg[immA], 6*16 +  4}, (A::Xmm)dst(), 2);
3698                     a->vpextrd(A::Mem{arg[immA], 7*16 +  4}, (A::Xmm)dst(), 3);
3699 
3700                     a->vextracti128(dst(), r(z), 1);
3701                     a->vmovd  (A::Mem{arg[immA], 4*16 +  8}, (A::Xmm)dst()   );
3702                     a->vpextrd(A::Mem{arg[immA], 5*16 +  8}, (A::Xmm)dst(), 1);
3703                     a->vpextrd(A::Mem{arg[immA], 6*16 +  8}, (A::Xmm)dst(), 2);
3704                     a->vpextrd(A::Mem{arg[immA], 7*16 +  8}, (A::Xmm)dst(), 3);
3705 
3706                     a->vextracti128(dst(), r(w), 1);
3707                     a->vmovd  (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst()   );
3708                     a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1);
3709                     a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2);
3710                     a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3);
3711                 } break;
3712 
3713                 case Op::load8:  if (scalar) {
3714                                      a->vpxor  (dst(), dst(), dst());
3715                                      a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
3716                                  } else {
3717                                      a->vpmovzxbd(dst(), A::Mem{arg[immA]});
3718                                  } break;
3719 
3720                 case Op::load16: if (scalar) {
3721                                      a->vpxor  (dst(), dst(), dst());
3722                                      a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
3723                                  } else {
3724                                      a->vpmovzxwd(dst(), A::Mem{arg[immA]});
3725                                  } break;
3726 
3727                 case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), A::Mem{arg[immA]}); }
3728                                  else        { a->vmovups(        dst(), A::Mem{arg[immA]}); }
3729                                  break;
3730 
3731                 case Op::load64: if (scalar) {
3732                                     a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
3733                                  } else {
3734                                     A::Ymm tmp = alloc_tmp();
3735                                     a->vmovups(tmp, &load64_index);
3736                                     a->vpermps(dst(), tmp, A::Mem{arg[immA],  0});
3737                                     a->vpermps(  tmp, tmp, A::Mem{arg[immA], 32});
3738                                     // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1.
3739                                     a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20);
3740                                     free_tmp(tmp);
3741                                  } break;
3742 
3743                 case Op::load128: if (scalar) {
3744                                       a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
3745                                   } else {
3746                                       // Load 4 low values into xmm tmp,
3747                                       A::Ymm tmp = alloc_tmp();
3748                                       A::Xmm t = (A::Xmm)tmp;
3749                                       a->vmovd  (t,   A::Mem{arg[immA], 0*16 + 4*immB}   );
3750                                       a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1);
3751                                       a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2);
3752                                       a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3);
3753 
3754                                       // Load 4 high values into xmm dst(),
3755                                       A::Xmm d = (A::Xmm)dst();
3756                                       a->vmovd  (d,   A::Mem{arg[immA], 4*16 + 4*immB}   );
3757                                       a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1);
3758                                       a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2);
3759                                       a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3);
3760 
3761                                       // Merge the two, ymm dst() = {xmm tmp|xmm dst()}
3762                                       a->vperm2f128(dst(), tmp,dst(), 0x20);
3763                                       free_tmp(tmp);
3764                                   } break;
3765 
3766                 case Op::gather8: {
3767                     // As usual, the gather base pointer is immB bytes off of uniform immA.
3768                     a->mov(GP0, A::Mem{arg[immA], immB});
3769 
3770                     A::Ymm tmp = alloc_tmp();
3771                     a->vmovups(tmp, any(x));
3772 
3773                     for (int i = 0; i < active_lanes; i++) {
3774                         if (i == 4) {
3775                             // vpextrd can only pluck indices out from an Xmm register,
3776                             // so we manually swap over to the top when we're halfway through.
3777                             a->vextracti128((A::Xmm)tmp, tmp, 1);
3778                         }
3779                         a->vpextrd(GP1, (A::Xmm)tmp, i%4);
3780                         a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i);
3781                     }
3782                     a->vpmovzxbd(dst(), dst());
3783                     free_tmp(tmp);
3784                 } break;
3785 
3786                 case Op::gather16: {
3787                     // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
3788                     a->mov(GP0, A::Mem{arg[immA], immB});
3789 
3790                     A::Ymm tmp = alloc_tmp();
3791                     a->vmovups(tmp, any(x));
3792 
3793                     for (int i = 0; i < active_lanes; i++) {
3794                         if (i == 4) {
3795                             a->vextracti128((A::Xmm)tmp, tmp, 1);
3796                         }
3797                         a->vpextrd(GP1, (A::Xmm)tmp, i%4);
3798                         a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i);
3799                     }
3800                     a->vpmovzxwd(dst(), dst());
3801                     free_tmp(tmp);
3802                 } break;
3803 
3804                 case Op::gather32:
3805                 if (scalar) {
3806                     // Our gather base pointer is immB bytes off of uniform immA.
3807                     a->mov(GP0, A::Mem{arg[immA], immB});
3808 
3809                     // Grab our index from lane 0 of the index argument.
3810                     a->vmovd(GP1, (A::Xmm)r(x));
3811 
3812                     // dst = *(base + 4*index)
3813                     a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR});
3814                 } else {
3815                     a->mov(GP0, A::Mem{arg[immA], immB});
3816 
3817                     A::Ymm mask = alloc_tmp();
3818                     a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)
3819 
3820                     a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask);
3821                     free_tmp(mask);
3822                 }
3823                 break;
3824 
3825                 case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB});
3826                                     break;
3827 
3828                 case Op::array32: a->mov(GP0, A::Mem{arg[immA], immB});
3829                                   a->vbroadcastss(dst(), A::Mem{GP0, immC});
3830                                   break;
3831 
3832                 case Op::index: a->vmovd((A::Xmm)dst(), N);
3833                                 a->vbroadcastss(dst(), dst());
3834                                 a->vpsubd(dst(), dst(), &iota);
3835                                 break;
3836 
3837                 // We can swap the arguments of symmetric instructions to make better use of any().
3838                 case Op::add_f32:
3839                     if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); }
3840                     else           { a->vaddps(dst(y), r(y), any(x)); }
3841                                      break;
3842 
3843                 case Op::mul_f32:
3844                     if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); }
3845                     else           { a->vmulps(dst(y), r(y), any(x)); }
3846                                      break;
3847 
3848                 case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break;
3849                 case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break;
3850                 case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break;  // Order matters,
3851                 case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break;  // see test SkVM_min_max.
3852 
3853                 case Op::fma_f32:
3854                     if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else
3855                     if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else
3856                     if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else
3857                                       { a->vmovups    (dst(), any(x));
3858                                         a->vfmadd132ps(dst(), r(z), any(y)); }
3859                                         break;
3860 
3861                 case Op::fms_f32:
3862                     if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else
3863                     if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else
3864                     if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else
3865                                       { a->vmovups    (dst(), any(x));
3866                                         a->vfmsub132ps(dst(), r(z), any(y)); }
3867                                         break;
3868 
3869                 case Op::fnma_f32:
3870                     if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else
3871                     if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else
3872                     if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else
3873                                       { a->vmovups     (dst(), any(x));
3874                                         a->vfnmadd132ps(dst(), r(z), any(y)); }
3875                                         break;
3876 
3877                 // In situations like this we want to try aliasing dst(x) when x is
3878                 // already in a register, but not if we'd have to load it from the stack
3879                 // just to alias it.  That's done better directly into the new register.
3880                 case Op::sqrt_f32:
3881                     if (in_reg(x)) { a->vsqrtps(dst(x),  r(x)); }
3882                     else           { a->vsqrtps(dst(), any(x)); }
3883                                      break;
3884 
3885                 case Op::add_i32:
3886                     if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); }
3887                     else           { a->vpaddd(dst(y), r(y), any(x)); }
3888                                      break;
3889 
3890                 case Op::mul_i32:
3891                     if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); }
3892                     else           { a->vpmulld(dst(y), r(y), any(x)); }
3893                                      break;
3894 
3895                 case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break;
3896 
3897                 case Op::bit_and:
3898                     if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); }
3899                     else           { a->vpand(dst(y), r(y), any(x)); }
3900                                      break;
3901                 case Op::bit_or:
3902                     if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); }
3903                     else           { a->vpor(dst(y), r(y), any(x)); }
3904                                      break;
3905                 case Op::bit_xor:
3906                     if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); }
3907                     else           { a->vpxor(dst(y), r(y), any(x)); }
3908                                      break;
3909 
3910                 case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x.
3911 
3912                 case Op::select:
3913                     if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); }
3914                     else              { a->vpblendvb(dst(x), r(z), any(y), r(x)); }
3915                                         break;
3916 
3917                 case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break;
3918                 case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break;
3919                 case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break;
3920 
3921                 case Op::eq_i32:
3922                     if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); }
3923                     else           { a->vpcmpeqd(dst(y), r(y), any(x)); }
3924                                      break;
3925 
3926                 case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break;
3927 
3928                 case Op::eq_f32:
3929                     if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); }
3930                     else           { a->vcmpeqps(dst(y), r(y), any(x)); }
3931                                      break;
3932                 case Op::neq_f32:
3933                     if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); }
3934                     else           { a->vcmpneqps(dst(y), r(y), any(x)); }
3935                                      break;
3936 
3937                 case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break;
3938                 case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break;
3939 
3940                 case Op::ceil:
3941                     if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::CEIL); }
3942                     else           { a->vroundps(dst(), any(x), Assembler::CEIL); }
3943                                      break;
3944 
3945                 case Op::floor:
3946                     if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::FLOOR); }
3947                     else           { a->vroundps(dst(), any(x), Assembler::FLOOR); }
3948                                      break;
3949 
3950                 case Op::to_f32:
3951                     if (in_reg(x)) { a->vcvtdq2ps(dst(x),  r(x)); }
3952                     else           { a->vcvtdq2ps(dst(), any(x)); }
3953                                      break;
3954 
3955                 case Op::trunc:
3956                     if (in_reg(x)) { a->vcvttps2dq(dst(x),  r(x)); }
3957                     else           { a->vcvttps2dq(dst(), any(x)); }
3958                                      break;
3959 
3960                 case Op::round:
3961                     if (in_reg(x)) { a->vcvtps2dq(dst(x),  r(x)); }
3962                     else           { a->vcvtps2dq(dst(), any(x)); }
3963                                      break;
3964 
3965                 case Op::to_fp16:
3966                     a->vcvtps2ph(dst(x), r(x), A::CURRENT);  // f32 ymm -> f16 xmm
3967                     a->vpmovzxwd(dst(), dst());              // f16 xmm -> f16 ymm
3968                     break;
3969 
3970                 case Op::from_fp16:
3971                     a->vpackusdw(dst(x), r(x), r(x));  // f16 ymm -> f16 xmm
3972                     a->vpermq   (dst(), dst(), 0xd8);  // swap middle two 64-bit lanes
3973                     a->vcvtph2ps(dst(), dst());        // f16 xmm -> f32 ymm
3974                     break;
3975 
3976             #elif defined(__aarch64__)
3977                 case Op::assert_true: {
3978                     a->uminv4s(dst(), r(x));   // uminv acts like an all() across the vector.
3979                     a->movs(GP0, dst(), 0);
3980                     A::Label all_true;
3981                     a->cbnz(GP0, &all_true);
3982                     a->brk(0);
3983                     a->label(&all_true);
3984                 } break;
3985 
3986                 case Op::trace_line:
3987                 case Op::trace_var:
3988                 case Op::trace_call:
3989                     /* Only supported in the interpreter. */
3990                     break;
3991 
3992                 case Op::index: {
3993                     A::V tmp = alloc_tmp();
3994                     a->ldrq (tmp, &iota);
3995                     a->dup4s(dst(), N);
3996                     a->sub4s(dst(), dst(), tmp);
3997                     free_tmp(tmp);
3998                 } break;
3999 
4000                 case Op::store8: a->xtns2h(dst(x), r(x));
4001                                  a->xtnh2b(dst(), dst());
4002                    if (scalar) { a->strb  (dst(), arg[immA]); }
4003                    else        { a->strs  (dst(), arg[immA]); }
4004                                  break;
4005 
4006                 case Op::store16: a->xtns2h(dst(x), r(x));
4007                     if (scalar) { a->strh  (dst(), arg[immA]); }
4008                     else        { a->strd  (dst(), arg[immA]); }
4009                                   break;
4010 
4011                 case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); }
4012                                   else        { a->strq(r(x), arg[immA]); }
4013                                                 break;
4014 
4015                 case Op::store64: if (scalar) {
4016                                       a->strs(r(x), arg[immA], 0);
4017                                       a->strs(r(y), arg[immA], 1);
4018                                   } else if (r(y) == r(x)+1) {
4019                                       a->st24s(r(x), arg[immA]);
4020                                   } else {
4021                                       Reg tmp0 = alloc_tmp(2),
4022                                           tmp1 = (Reg)(tmp0+1);
4023                                       a->orr16b(tmp0, r(x), r(x));
4024                                       a->orr16b(tmp1, r(y), r(y));
4025                                       a-> st24s(tmp0, arg[immA]);
4026                                       free_tmp(tmp0);
4027                                       free_tmp(tmp1);
4028                                   } break;
4029 
4030                 case Op::store128:
4031                     if (scalar) {
4032                         a->strs(r(x), arg[immA], 0);
4033                         a->strs(r(y), arg[immA], 1);
4034                         a->strs(r(z), arg[immA], 2);
4035                         a->strs(r(w), arg[immA], 3);
4036                     } else if (r(y) == r(x)+1 &&
4037                                r(z) == r(x)+2 &&
4038                                r(w) == r(x)+3) {
4039                         a->st44s(r(x), arg[immA]);
4040                     } else {
4041                         Reg tmp0 = alloc_tmp(4),
4042                             tmp1 = (Reg)(tmp0+1),
4043                             tmp2 = (Reg)(tmp0+2),
4044                             tmp3 = (Reg)(tmp0+3);
4045                         a->orr16b(tmp0, r(x), r(x));
4046                         a->orr16b(tmp1, r(y), r(y));
4047                         a->orr16b(tmp2, r(z), r(z));
4048                         a->orr16b(tmp3, r(w), r(w));
4049                         a-> st44s(tmp0, arg[immA]);
4050                         free_tmp(tmp0);
4051                         free_tmp(tmp1);
4052                         free_tmp(tmp2);
4053                         free_tmp(tmp3);
4054                     } break;
4055 
4056 
4057                 case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); }
4058                                 else        { a->ldrs(dst(), arg[immA]); }
4059                                               a->uxtlb2h(dst(), dst());
4060                                               a->uxtlh2s(dst(), dst());
4061                                               break;
4062 
4063                 case Op::load16: if (scalar) { a->ldrh(dst(), arg[immA]); }
4064                                  else        { a->ldrd(dst(), arg[immA]); }
4065                                                a->uxtlh2s(dst(), dst());
4066                                                break;
4067 
4068                 case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); }
4069                                  else        { a->ldrq(dst(), arg[immA]); }
4070                                                break;
4071 
4072                 case Op::load64: if (scalar) {
4073                                     a->ldrs(dst(), arg[immA], immB);
4074                                  } else {
4075                                     Reg tmp0 = alloc_tmp(2),
4076                                         tmp1 = (Reg)(tmp0+1);
4077                                     a->ld24s(tmp0, arg[immA]);
4078                                     // TODO: return both
4079                                     switch (immB) {
4080                                         case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break;
4081                                         case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break;
4082                                     }
4083                                  } break;
4084 
4085                 case Op::load128: if (scalar) {
4086                                       a->ldrs(dst(), arg[immA], immB);
4087                                   } else {
4088                                       Reg tmp0 = alloc_tmp(4),
4089                                           tmp1 = (Reg)(tmp0+1),
4090                                           tmp2 = (Reg)(tmp0+2),
4091                                           tmp3 = (Reg)(tmp0+3);
4092                                       a->ld44s(tmp0, arg[immA]);
4093                                       // TODO: return all four
4094                                       switch (immB) {
4095                                           case 0: mark_tmp_as_dst(tmp0); break;
4096                                           case 1: mark_tmp_as_dst(tmp1); break;
4097                                           case 2: mark_tmp_as_dst(tmp2); break;
4098                                           case 3: mark_tmp_as_dst(tmp3); break;
4099                                       }
4100                                       if (immB != 0) { free_tmp(tmp0); }
4101                                       if (immB != 1) { free_tmp(tmp1); }
4102                                       if (immB != 2) { free_tmp(tmp2); }
4103                                       if (immB != 3) { free_tmp(tmp3); }
4104                                   } break;
4105 
4106                 case Op::uniform32: a->add(GP0, arg[immA], immB);
4107                                     a->ld1r4s(dst(), GP0);
4108                                     break;
4109 
4110                 case Op::array32: a->add(GP0, arg[immA], immB);
4111                                   a->ldrd(GP0, GP0);
4112                                   a->add(GP0, GP0, immC);
4113                                   a->ld1r4s(dst(), GP0);
4114                                   break;
4115 
4116                 case Op::gather8: {
4117                     // As usual, the gather base pointer is immB bytes off of uniform immA.
4118                     a->add (GP0, arg[immA], immB);  // GP0 = &(gather base pointer)
4119                     a->ldrd(GP0, GP0);              // GP0 =   gather base pointer
4120 
4121                     for (int i = 0; i < active_lanes; i++) {
4122                         a->movs(GP1, r(x), i);    // Extract index lane i into GP1.
4123                         a->add (GP1, GP0, GP1);   // Add the gather base pointer.
4124                         a->ldrb(GP1, GP1);        // Load that byte.
4125                         a->inss(dst(x), GP1, i);  // Insert it into dst() lane i.
4126                     }
4127                 } break;
4128 
4129                 // See gather8 for general idea; comments here only where gather16 differs.
4130                 case Op::gather16: {
4131                     a->add (GP0, arg[immA], immB);
4132                     a->ldrd(GP0, GP0);
4133                     for (int i = 0; i < active_lanes; i++) {
4134                         a->movs(GP1, r(x), i);
4135                         a->add (GP1, GP0, GP1, A::LSL, 1);  // Scale index 2x into a byte offset.
4136                         a->ldrh(GP1, GP1);                  // 2-byte load.
4137                         a->inss(dst(x), GP1, i);
4138                     }
4139                 } break;
4140 
4141                 // See gather8 for general idea; comments here only where gather32 differs.
4142                 case Op::gather32: {
4143                     a->add (GP0, arg[immA], immB);
4144                     a->ldrd(GP0, GP0);
4145                     for (int i = 0; i < active_lanes; i++) {
4146                         a->movs(GP1, r(x), i);
4147                         a->add (GP1, GP0, GP1, A::LSL, 2);  // Scale index 4x into a byte offset.
4148                         a->ldrs(GP1, GP1);                  // 4-byte load.
4149                         a->inss(dst(x), GP1, i);
4150                     }
4151                 } break;
4152 
4153                 case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break;
4154                 case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break;
4155                 case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break;
4156                 case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break;
4157 
4158                 case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break;
4159 
4160                 case Op::fma_f32: // fmla.4s is z += x*y
4161                     if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
4162                     else              { a->orr16b(dst(), r(z), r(z));
4163                                         a->fmla4s(dst(), r(x), r(y)); }
4164                                         break;
4165 
4166                 case Op::fnma_f32:  // fmls.4s is z -= x*y
4167                     if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
4168                     else              { a->orr16b(dst(), r(z), r(z));
4169                                         a->fmls4s(dst(), r(x), r(y)); }
4170                                         break;
4171 
4172                 case Op::fms_f32:   // calculate z - xy, then negate to xy - z
4173                     if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
4174                     else              { a->orr16b(dst(), r(z), r(z));
4175                                         a->fmls4s(dst(), r(x), r(y)); }
4176                                         a->fneg4s(dst(), dst());
4177                                         break;
4178 
4179                 case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break;
4180                 case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break;
4181                 case Op:: eq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); break;
4182                 case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y));
4183                                   a->not16b  (dst(), dst());         break;
4184 
4185 
4186                 case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break;
4187                 case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break;
4188                 case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break;
4189 
4190                 case Op::bit_and  : a->and16b(dst(x,y), r(x), r(y)); break;
4191                 case Op::bit_or   : a->orr16b(dst(x,y), r(x), r(y)); break;
4192                 case Op::bit_xor  : a->eor16b(dst(x,y), r(x), r(y)); break;
4193                 case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break;
4194 
4195                 case Op::select: // bsl16b is x = x ? y : z
4196                     if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
4197                     else              { a->orr16b(dst(), r(x), r(x));
4198                                         a->bsl16b(dst(), r(y), r(z)); }
4199                                         break;
4200 
4201                 // fmin4s and fmax4s don't work the way we want with NaN,
4202                 // so we write them the long way:
4203                 case Op::min_f32: // min(x,y) = y<x ? y : x
4204                                   a->fcmgt4s(dst(), r(x), r(y));
4205                                   a->bsl16b (dst(), r(y), r(x));
4206                                   break;
4207 
4208                 case Op::max_f32: // max(x,y) = x<y ? y : x
4209                                   a->fcmgt4s(dst(), r(y), r(x));
4210                                   a->bsl16b (dst(), r(y), r(x));
4211                                   break;
4212 
4213                 case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break;
4214                 case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break;
4215                 case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break;
4216 
4217                 case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break;
4218                 case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break;
4219 
4220                 case Op::to_f32: a->scvtf4s (dst(x), r(x)); break;
4221                 case Op::trunc:  a->fcvtzs4s(dst(x), r(x)); break;
4222                 case Op::round:  a->fcvtns4s(dst(x), r(x)); break;
4223                 case Op::ceil:   a->frintp4s(dst(x), r(x)); break;
4224                 case Op::floor:  a->frintm4s(dst(x), r(x)); break;
4225 
4226                 case Op::to_fp16:
4227                     a->fcvtn  (dst(x), r(x));    // 4x f32 -> 4x f16 in bottom four lanes
4228                     a->uxtlh2s(dst(), dst());    // expand to 4x f16 in even 16-bit lanes
4229                     break;
4230 
4231                 case Op::from_fp16:
4232                     a->xtns2h(dst(x), r(x));     // pack even 16-bit lanes into bottom four lanes
4233                     a->fcvtl (dst(), dst());     // 4x f16 -> 4x f32
4234                     break;
4235             #endif
4236             }
4237 
4238             // Proactively free the registers holding any value that dies here.
4239             if (rd != NA &&                   dies_here(regs[rd])) { regs[rd] = NA; }
4240             if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; }
4241             if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; }
4242             if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; }
4243             if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; }
4244             return true;
4245         };
4246 
4247         #if defined(__x86_64__) || defined(_M_X64)
4248             auto jump_if_less = [&](A::Label* l) { a->jl (l); };
4249             auto jump         = [&](A::Label* l) { a->jmp(l); };
4250 
4251             auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
4252             auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
4253         #elif defined(__aarch64__)
4254             auto jump_if_less = [&](A::Label* l) { a->blt(l); };
4255             auto jump         = [&](A::Label* l) { a->b  (l); };
4256 
4257             auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
4258             auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
4259         #endif
4260 
4261         A::Label body,
4262                  tail,
4263                  done;
4264 
4265         enter();
4266         for (Val id = 0; id < (Val)instructions.size(); id++) {
4267             if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
4268                 return false;
4269             }
4270         }
4271 
4272         // This point marks a kind of canonical fixed point for register contents: if loop
4273         // code is generated as if these registers are holding these values, the next time
4274         // the loop comes around we'd better find those same registers holding those same values.
4275         auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot,
4276                                       saved_next_stack_slot=next_stack_slot]{
4277             for (int r = 0; r < (int)regs.size(); r++) {
4278                 if (regs[r] != incoming[r]) {
4279                     regs[r]  = incoming[r];
4280                     if (regs[r] >= 0) {
4281                         load_from_memory((Reg)r, regs[r]);
4282                     }
4283                 }
4284             }
4285             *stack_hint = std::max(*stack_hint, next_stack_slot);
4286             stack_slot = saved_stack_slot;
4287             next_stack_slot = saved_next_stack_slot;
4288         };
4289 
4290         a->label(&body);
4291         {
4292             a->cmp(N, K);
4293             jump_if_less(&tail);
4294             for (Val id = 0; id < (Val)instructions.size(); id++) {
4295                 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
4296                     return false;
4297                 }
4298             }
4299             restore_incoming_regs();
4300             for (int i = 0; i < (int)fImpl->strides.size(); i++) {
4301                 if (fImpl->strides[i]) {
4302                     add(arg[i], K*fImpl->strides[i]);
4303                 }
4304             }
4305             sub(N, K);
4306             jump(&body);
4307         }
4308 
4309         a->label(&tail);
4310         {
4311             a->cmp(N, 1);
4312             jump_if_less(&done);
4313             for (Val id = 0; id < (Val)instructions.size(); id++) {
4314                 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) {
4315                     return false;
4316                 }
4317             }
4318             restore_incoming_regs();
4319             for (int i = 0; i < (int)fImpl->strides.size(); i++) {
4320                 if (fImpl->strides[i]) {
4321                     add(arg[i], 1*fImpl->strides[i]);
4322                 }
4323             }
4324             sub(N, 1);
4325             jump(&tail);
4326         }
4327 
4328         a->label(&done);
4329         {
4330             exit();
4331         }
4332 
4333         // Except for explicit aligned load and store instructions, AVX allows
4334         // memory operands to be unaligned.  So even though we're creating 16
4335         // byte patterns on ARM or 32-byte patterns on x86, we only need to
4336         // align to 4 bytes, the element size and alignment requirement.
4337 
4338         constants.foreach([&](int imm, A::Label* label) {
4339             a->align(4);
4340             a->label(label);
4341             for (int i = 0; i < K; i++) {
4342                 a->word(imm);
4343             }
4344         });
4345 
4346         if (!iota.references.empty()) {
4347             a->align(4);
4348             a->label(&iota);        // 0,1,2,3,4,...
4349             for (int i = 0; i < K; i++) {
4350                 a->word(i);
4351             }
4352         }
4353 
4354         if (!load64_index.references.empty()) {
4355             a->align(4);
4356             a->label(&load64_index);  // {0,2,4,6|1,3,5,7}
4357             a->word(0); a->word(2); a->word(4); a->word(6);
4358             a->word(1); a->word(3); a->word(5); a->word(7);
4359         }
4360 
4361         return true;
4362     }
4363 
setupJIT(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)4364     void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
4365                            const char* debug_name) {
4366         // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
4367         // and stack_hint/registers_used to feed forward into the next jit() call.
4368         Assembler a{nullptr};
4369         int stack_hint = -1;
4370         uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
4371         if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
4372             return;
4373         }
4374 
4375         fImpl->jit_size = a.size();
4376         void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
4377         fImpl->jit_entry.store(jit_entry);
4378 
4379         // Assemble the program for real with stack_hint/registers_used as feedback from first call.
4380         a = Assembler{jit_entry};
4381         SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
4382         SkASSERT(a.size() <= fImpl->jit_size);
4383 
4384         // Remap as executable, and flush caches on platforms that need that.
4385         remap_as_executable(jit_entry, fImpl->jit_size);
4386 
4387         notify_vtune(debug_name, jit_entry, fImpl->jit_size);
4388 
4389     #if !defined(SK_BUILD_FOR_WIN)
4390         // For profiling and debugging, it's helpful to have this code loaded
4391         // dynamically rather than just jumping info fImpl->jit_entry.
4392         if (gSkVMJITViaDylib) {
4393             // Dump the raw program binary.
4394             SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
4395             int fd = mkstemp(path.writable_str());
4396             ::write(fd, jit_entry, a.size());
4397             close(fd);
4398 
4399             this->dropJIT();  // (unmap and null out fImpl->jit_entry.)
4400 
4401             // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4402             SkString cmd = SkStringPrintf(
4403                     "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4404                     " | clang -x assembler -shared - -o %s",
4405                     path.c_str(), path.c_str());
4406             system(cmd.c_str());
4407 
4408             // Load that dynamic library and look up skvm_jit().
4409             fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
4410             void* sym = nullptr;
4411             for (const char* name : {"skvm_jit", "_skvm_jit"} ) {
4412                 if (!sym) { sym = dlsym(fImpl->dylib, name); }
4413             }
4414             fImpl->jit_entry.store(sym);
4415         }
4416     #endif
4417     }
4418 
disassemble(SkWStream * o) const4419     void Program::disassemble(SkWStream* o) const {
4420     #if !defined(SK_BUILD_FOR_WIN)
4421         SkDebugfStream debug;
4422         if (!o) { o = &debug; }
4423 
4424         const void* jit_entry = fImpl->jit_entry.load();
4425         size_t jit_size = fImpl->jit_size;
4426 
4427         if (!jit_entry) {
4428             o->writeText("Program not JIT'd. Did you pass --jit?\n");
4429             return;
4430         }
4431 
4432         char path[] = "/tmp/skvm-jit.XXXXXX";
4433         int fd = mkstemp(path);
4434         ::write(fd, jit_entry, jit_size);
4435         close(fd);
4436 
4437         // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4438         SkString cmd = SkStringPrintf(
4439                 "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4440                 " | clang -x assembler -shared - -o %s",
4441                 path, path);
4442         system(cmd.c_str());
4443 
4444         // Now objdump to disassemble our function:
4445         // TODO: We could trim this down to just our code using '--disassemble=<symbol name>`,
4446         // but the symbol name varies with OS, and that option may be missing from objdump on some
4447         // machines? There also apears to be quite a bit of junk after the end of the JIT'd code.
4448         // Trimming that would let us pass '--visualize-jumps' and get the loop annotated.
4449         // With the junk, we tend to end up with a bunch of stray jumps that pollute the ASCII art.
4450         cmd = SkStringPrintf("objdump -D %s", path);
4451     #if defined(SK_BUILD_FOR_UNIX)
4452         cmd.append(" --section=.text");
4453     #endif
4454         FILE* fp = popen(cmd.c_str(), "r");
4455         if (!fp) {
4456             o->writeText("objdump failed\n");
4457             return;
4458         }
4459 
4460         char line[1024];
4461         while (fgets(line, sizeof(line), fp)) {
4462             o->writeText(line);
4463         }
4464 
4465         pclose(fp);
4466     #endif
4467     }
4468 
4469 #endif
4470 
4471 }  // namespace skvm
4472