• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkStream.h"
9 #include "include/core/SkString.h"
10 #include "include/private/base/SkTFitsIn.h"
11 #include "include/private/base/SkThreadID.h"
12 #include "src/base/SkHalf.h"
13 #include "src/core/SkColorSpacePriv.h"
14 #include "src/core/SkColorSpaceXformSteps.h"
15 #include "src/core/SkCpu.h"
16 #include "src/core/SkEnumerate.h"
17 #include "src/core/SkOpts.h"
18 #include "src/core/SkStreamPriv.h"
19 #include "src/core/SkVM.h"
20 #include "src/utils/SkVMVisualizer.h"
21 #include <algorithm>
22 #include <atomic>
23 #include <queue>
24 
25 #if !defined(SK_BUILD_FOR_WIN)
26 #include <unistd.h>
27 #endif
28 
// Runtime switches controlling the JIT backend.  Both default to off.
// NOTE(review): plain non-atomic bools — presumably set once at startup before
// any cross-thread use; confirm against callers.
bool gSkVMAllowJIT{false};
bool gSkVMJITViaDylib{false};
31 
#if defined(SKVM_JIT)
    #if defined(SK_BUILD_FOR_WIN)
        #include "src/base/SkLeanWindows.h"
        #include <memoryapi.h>

        // Reserve and commit a read/write buffer to hold JIT code.
        // Returns nullptr on failure.  (VirtualAlloc works at page granularity,
        // so the commit is effectively rounded up to whole pages for us.)
        static void* alloc_jit_buffer(size_t* len) {
            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
        }

        // Flip a buffer from read/write to read/execute once code is written.
        static void remap_as_executable(void* ptr, size_t len) {
            DWORD old;
            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
            SkASSERT(old == PAGE_READWRITE);
        }

        static void unmap_jit_buffer(void* ptr, size_t len) {
            // MEM_RELEASE frees the entire reservation and requires dwSize == 0,
            // so `len` is deliberately unused here.
            VirtualFree(ptr, 0, MEM_RELEASE);
        }

        static void close_dylib(void* dylib) {
            SkASSERT(false);  // TODO?  For now just assert we never make one.
        }
    #else
        #include <dlfcn.h>
        #include <sys/mman.h>

        // Reserve a read/write buffer to hold JIT code, rounding *len up to
        // page granularity.  Returns nullptr on failure.
        static void* alloc_jit_buffer(size_t* len) {
            // While mprotect and VirtualAlloc both work at page granularity,
            // mprotect doesn't round up for you, and instead requires *len is at page granularity.
            const size_t page = sysconf(_SC_PAGESIZE);
            *len = ((*len + page - 1) / page) * page;
            void* ptr = mmap(nullptr, *len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
            // mmap reports failure with MAP_FAILED ((void*)-1), not nullptr.
            // Normalize to nullptr so callers can use the same failure check as
            // the VirtualAlloc path above.
            return ptr != MAP_FAILED ? ptr : nullptr;
        }

        // Flip the buffer to read/execute, and invalidate instruction caches so
        // freshly written code is safe to run.
        static void remap_as_executable(void* ptr, size_t len) {
            mprotect(ptr, len, PROT_READ|PROT_EXEC);
            __builtin___clear_cache((char*)ptr,
                                    (char*)ptr + len);
        }

        static void unmap_jit_buffer(void* ptr, size_t len) {
            munmap(ptr, len);
        }

        static void close_dylib(void* dylib) {
            dlclose(dylib);
        }
    #endif
#endif
75 
76 // JIT code isn't MSAN-instrumented, so we won't see when it uses
77 // uninitialized memory, and we'll not see the writes it makes as properly
78 // initializing memory.  Instead force the interpreter, which should let
79 // MSAN see everything our programs do properly.
80 //
81 // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
82 #if defined(__has_feature)
83     #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
84         #define SKVM_JIT_BUT_IGNORE_IT
85     #endif
86 #endif
87 
#if defined(SKSL_STANDALONE)
    // skslc needs to link against this module (for the VM code generator). This module pulls in
    // color-space code, but attempting to add those transitive dependencies to skslc gets out of
    // hand. So we terminate the chain here with stub functions. Note that skslc's usage of SkVM
    // never cares about color management.
    skvm::F32 sk_program_transfer_fn(skvm::F32 v, skcms_TFType tf_type,
                                     skvm::F32 G, skvm::F32 A, skvm::F32 B, skvm::F32 C,
                                     skvm::F32 D, skvm::F32 E, skvm::F32 F) {
        // Stub: pass the value through untouched.
        return v;
    }

    // Stubs: skslc never consults these transfer functions.
    const skcms_TransferFunction* skcms_sRGB_TransferFunction() {
        return nullptr;
    }
    const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction() {
        return nullptr;
    }
#endif
102 
103 namespace skvm {
104 
detect_features()105     static Features detect_features() {
106         static const bool fma =
107         #if defined(SK_CPU_X86)
108             SkCpu::Supports(SkCpu::HSW);
109         #elif defined(SK_CPU_ARM64)
110             true;
111         #else
112             false;
113         #endif
114 
115         static const bool fp16 = false;  // TODO
116 
117         return { fma, fp16 };
118     }
119 
    // Detect instruction-set features (fma, fp16) from the running CPU.
    Builder::Builder(bool createDuplicates)
        : fFeatures(detect_features()), fCreateDuplicates(createDuplicates) {}
    // Use a caller-supplied feature set instead of probing the CPU.
    Builder::Builder(Features features, bool createDuplicates)
        : fFeatures(features         ), fCreateDuplicates(createDuplicates) {}
124 
    // Private state backing a Program: the interpreter form of the program,
    // plus optional JIT artifacts.
    struct Program::Impl {
        // Interpreter program.  Per Program::dump(), instructions at index >= loop
        // form the per-iteration loop body; those before it run once up front.
        std::vector<InterpreterInstruction> instructions;
        int regs = 0;  // number of interpreter registers used
        int loop = 0;  // index of the first in-loop instruction
        std::vector<int> strides;            // per-Ptr-argument strides (see Builder::arg)
        std::vector<TraceHook*> traceHooks;  // hooks driven by the trace_* ops
        std::unique_ptr<viz::Visualizer> visualizer;

        // JIT state; jit_entry is null until (and unless) code is JITted.
        std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
        size_t jit_size = 0;     // size of the executable JIT buffer
        void*  dylib    = nullptr;  // dlopen'd handle when JITting via dylib, else null
    };
137 
138     // Debugging tools, mostly for printing various data structures out to a stream.
139 
140     namespace {
141         struct V { Val id; };
142         struct R { Reg id; };
143         struct Shift       { int bits; };
144         struct Splat       { int bits; };
145         struct Hex         { int bits; };
146         struct TraceHookID { int bits; };
147         // For op `trace_line`
148         struct Line  { int bits; };
149         // For op `trace_var`
150         struct VarSlot { int bits; };
151         // For op `trace_enter`/`trace_exit`
152         struct FnIdx { int bits; };
153 
write(SkWStream * o,const char * s)154         static void write(SkWStream* o, const char* s) {
155             o->writeText(s);
156         }
157 
name(Op op)158         static const char* name(Op op) {
159             switch (op) {
160             #define M(x) case Op::x: return #x;
161                 SKVM_OPS(M)
162             #undef M
163             }
164             return "unknown op";
165         }
166 
write(SkWStream * o,Op op)167         static void write(SkWStream* o, Op op) {
168             o->writeText(name(op));
169         }
write(SkWStream * o,Ptr p)170         static void write(SkWStream* o, Ptr p) {
171             write(o, "ptr");
172             o->writeDecAsText(p.ix);
173         }
write(SkWStream * o,V v)174         static void write(SkWStream* o, V v) {
175             write(o, "v");
176             o->writeDecAsText(v.id);
177         }
write(SkWStream * o,R r)178         static void write(SkWStream* o, R r) {
179             write(o, "r");
180             o->writeDecAsText(r.id);
181         }
write(SkWStream * o,Shift s)182         static void write(SkWStream* o, Shift s) {
183             o->writeDecAsText(s.bits);
184         }
write(SkWStream * o,Splat s)185         static void write(SkWStream* o, Splat s) {
186             float f;
187             memcpy(&f, &s.bits, 4);
188             o->writeHexAsText(s.bits);
189             write(o, " (");
190             o->writeScalarAsText(f);
191             write(o, ")");
192         }
write(SkWStream * o,Hex h)193         static void write(SkWStream* o, Hex h) {
194             o->writeHexAsText(h.bits);
195         }
write(SkWStream * o,TraceHookID h)196         static void write(SkWStream* o, TraceHookID h) {
197             o->writeDecAsText(h.bits);
198         }
write(SkWStream * o,Line d)199         static void write(SkWStream* o, Line d) {
200             write(o, "L");
201             o->writeDecAsText(d.bits);
202         }
write(SkWStream * o,VarSlot s)203         static void write(SkWStream* o, VarSlot s) {
204             write(o, "$");
205             o->writeDecAsText(s.bits);
206         }
write(SkWStream * o,FnIdx s)207         static void write(SkWStream* o, FnIdx s) {
208             write(o, "F");
209             o->writeDecAsText(s.bits);
210         }
211         template <typename T, typename... Ts>
write(SkWStream * o,T first,Ts...rest)212         static void write(SkWStream* o, T first, Ts... rest) {
213             write(o, first);
214             write(o, " ");
215             write(o, rest...);
216         }
217     }  // namespace
218 
    // Pretty-print one optimized instruction as "vN = op args..." (or just
    // "op args..." for side-effecting ops that produce no value), terminated
    // by a newline.  `id` is the instruction's own value ID in the program.
    static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) {
        Op  op = inst.op;
        Val  x = inst.x,
             y = inst.y,
             z = inst.z,
             w = inst.w;
        int immA = inst.immA,
            immB = inst.immB,
            immC = inst.immC;
        switch (op) {
            case Op::assert_true: write(o, op, V{x}, V{y}); break;

            // Trace ops: hook ID, then the two mask values in x/y.
            case Op::trace_line:  write(o, op, TraceHookID{immA}, V{x}, V{y}, Line{immB}); break;
            case Op::trace_var:   write(o, op, TraceHookID{immA}, V{x}, V{y},
                                                                  VarSlot{immB}, "=", V{z}); break;
            case Op::trace_enter: write(o, op, TraceHookID{immA}, V{x}, V{y}, FnIdx{immB}); break;
            case Op::trace_exit:  write(o, op, TraceHookID{immA}, V{x}, V{y}, FnIdx{immB}); break;
            case Op::trace_scope: write(o, op, TraceHookID{immA}, V{x}, V{y}, Shift{immB}); break;

            // Stores are pure side effects, so no "vN =" prefix.
            case Op::store8:   write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store16:  write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store32:  write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store64:  write(o, op, Ptr{immA}, V{x},V{y}          ); break;
            case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}); break;

            case Op::index: write(o, V{id}, "=", op); break;

            case Op::load8:   write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load16:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load32:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load64:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;

            case Op::gather8:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;

            case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::array32:   write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

            case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break;

            case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;


            case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;

            case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;


            case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;

            case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;


            case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;

            case Op::ceil:      write(o, V{id}, "=", op, V{x}); break;
            case Op::floor:     write(o, V{id}, "=", op, V{x}); break;
            case Op::to_f32:    write(o, V{id}, "=", op, V{x}); break;
            case Op::to_fp16:   write(o, V{id}, "=", op, V{x}); break;
            case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break;
            case Op::trunc:     write(o, V{id}, "=", op, V{x}); break;
            case Op::round:     write(o, V{id}, "=", op, V{x}); break;

            case Op::duplicate: write(o, V{id}, "=", op, Hex{immA}); break;
        }

        write(o, "\n");
    }
312 
dump(SkWStream * o) const313     void Builder::dump(SkWStream* o) const {
314         SkDebugfStream debug;
315         if (!o) { o = &debug; }
316 
317         std::vector<OptimizedInstruction> optimized = this->optimize();
318         o->writeDecAsText(optimized.size());
319         o->writeText(" values (originally ");
320         o->writeDecAsText(fProgram.size());
321         o->writeText("):\n");
322         for (Val id = 0; id < (Val)optimized.size(); id++) {
323             const OptimizedInstruction& inst = optimized[id];
324             write(o, inst.can_hoist ? "↑ " : "  ");
325             write_one_instruction(id, inst, o);
326         }
327     }
328 
visualize(SkWStream * output) const329     void Program::visualize(SkWStream* output) const {
330         if (fImpl->visualizer) {
331             fImpl->visualizer->dump(output);
332         }
333     }
334 
visualizer()335     viz::Visualizer* Program::visualizer() { return fImpl->visualizer.get(); }
    // Dump the register-allocated interpreter program: a header with register
    // and instruction counts, then one instruction per line.  A "loop:" label
    // marks index fImpl->loop, and in-loop instructions (index >= loop) are
    // printed indented.  Writes to SkDebugf when no stream is supplied.
    void Program::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        o->writeDecAsText(fImpl->regs);
        o->writeText(" registers, ");
        o->writeDecAsText(fImpl->instructions.size());
        o->writeText(" instructions:\n");
        for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
            if (i == fImpl->loop) { write(o, "loop:\n"); }
            o->writeDecAsText(i);
            o->writeText("\t");
            if (i >= fImpl->loop) { write(o, "    "); }
            const InterpreterInstruction& inst = fImpl->instructions[i];
            Op   op = inst.op;
            Reg   d = inst.d,
                  x = inst.x,
                  y = inst.y,
                  z = inst.z,
                  w = inst.w;
            int immA = inst.immA,
                immB = inst.immB,
                immC = inst.immC;
            // Same formatting as write_one_instruction(), but with registers
            // (rN) rather than value IDs (vN).
            switch (op) {
                case Op::assert_true: write(o, op, R{x}, R{y}); break;

                case Op::trace_line:  write(o, op, TraceHookID{immA},
                                                   R{x}, R{y}, Line{immB}); break;
                case Op::trace_var:   write(o, op, TraceHookID{immA}, R{x}, R{y},
                                                   VarSlot{immB}, "=", R{z}); break;
                case Op::trace_enter: write(o, op, TraceHookID{immA},
                                                   R{x}, R{y}, FnIdx{immB}); break;
                case Op::trace_exit:  write(o, op, TraceHookID{immA},
                                                   R{x}, R{y}, FnIdx{immB}); break;
                case Op::trace_scope: write(o, op, TraceHookID{immA},
                                                   R{x}, R{y}, Shift{immB}); break;

                case Op::store8:   write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store16:  write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store32:  write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store64:  write(o, op, Ptr{immA}, R{x}, R{y}            ); break;
                case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w}); break;

                case Op::index: write(o, R{d}, "=", op); break;

                case Op::load8:   write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load16:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load32:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load64:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;

                case Op::gather8:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;

                case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::array32:   write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

                case Op::splat:     write(o, R{d}, "=", op, Splat{immA}); break;

                case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;

                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;

                case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
                case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
                case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
                case Op::to_fp16:   write(o, R{d}, "=", op, R{x}); break;
                case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break;
                case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
                case Op::round:     write(o, R{d}, "=", op, R{x}); break;

                case Op::duplicate: write(o, R{d}, "=", op, Hex{immA}); break;
            }
            write(o, "\n");
        }
    }
    // Remove instructions whose results never feed a side-effecting op, and
    // remap the surviving value IDs so they stay dense.
    std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program,
                                                 viz::Visualizer* visualizer) {
        // Determine which Instructions are live by working back from side effects.
        // (Arguments always have smaller IDs than their users, so a single
        // backward pass settles each instruction's liveness before we reach
        // the instructions that produced its inputs.)
        std::vector<bool> live(program.size(), false);
        for (Val id = program.size(); id--;) {
            if (live[id] || has_side_effect(program[id].op)) {
                live[id] = true;
                const Instruction& inst = program[id];
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA) { live[arg] = true; }
                }
            }
        }

        // Rewrite the program with only live Instructions:
        //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
        //   - then actually remove the dead Instructions.
        std::vector<Val> new_id(program.size(), NA);
        for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
            if (live[id]) {
                Instruction& inst = program[id];
                for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) {
                    if (*arg != NA) {
                        *arg = new_id[*arg];
                        // Any argument of a live instruction is itself live, so
                        // it must already have been assigned its new ID above.
                        SkASSERT(*arg != NA);
                    }
                }
                new_id[id] = next++;
            }
        }

        if (visualizer) {
            visualizer->addInstructions(program);
            visualizer->markAsDeadCode(live, new_id);
        }

        // Eliminate any non-live ops.
        // (remove_if applies the predicate to each element once, in order, at its
        // original position, so &inst - program.data() recovers the original ID.)
        auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
            Val id = (Val)(&inst - program.data());
            return !live[id];
        });
        program.erase(it, program.end());

        return program;
    }
490 
    // Convert plain Instructions into OptimizedInstructions, annotating each with
    //   - death: the ID of the last instruction that uses its value, and
    //   - can_hoist: whether it is loop-invariant and may run once up front.
    std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program,
                                               viz::Visualizer* visualizer) {
        // Start with each value dying at its own issue and assumed hoistable.
        std::vector<OptimizedInstruction> optimized(program.size());
        for (Val id = 0; id < (Val)program.size(); id++) {
            Instruction inst = program[id];
            optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w,
                             inst.immA,inst.immB,inst.immC,
                             /*death=*/id, /*can_hoist=*/true};
        }

        // Each Instruction's inputs need to live at least until that Instruction issues.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];
            for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                // (We're walking in order, so this is the same as max()ing with the existing Val.)
                if (arg != NA) { optimized[arg].death = id; }
            }
        }

        // Mark which values don't depend on the loop and can be hoisted.
        // (Arguments precede their users, so their can_hoist is final by the
        // time each instruction consults it.)
        for (OptimizedInstruction& inst : optimized) {
            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
            if (is_always_varying(inst.op) || is_trace(inst.op)) {
                inst.can_hoist = false;
            }

            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
            if (inst.can_hoist) {
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
                }
            }
        }

        // Extend the lifetime of any hoisted value that's used in the loop to infinity.
        for (OptimizedInstruction& inst : optimized) {
            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
                    if (arg != NA && optimized[arg].can_hoist) {
                        optimized[arg].death = (Val)program.size();
                    }
                }
            }
        }

        if (visualizer) {
            visualizer->finalize(program, optimized);
        }

        return optimized;
    }
542 
optimize(viz::Visualizer * visualizer) const543     std::vector<OptimizedInstruction> Builder::optimize(viz::Visualizer* visualizer) const {
544         std::vector<Instruction> program = this->program();
545         program = eliminate_dead_code(std::move(program), visualizer);
546         return    finalize           (std::move(program), visualizer);
547     }
548 
done(const char * debug_name,bool allow_jit) const549     Program Builder::done(const char* debug_name,
550                           bool allow_jit) const {
551         return this->done(debug_name, allow_jit, /*visualizer=*/nullptr);
552     }
553 
done(const char * debug_name,bool allow_jit,std::unique_ptr<viz::Visualizer> visualizer) const554     Program Builder::done(const char* debug_name,
555                           bool allow_jit,
556                           std::unique_ptr<viz::Visualizer> visualizer) const {
557         char buf[64] = "skvm-jit-";
558         if (!debug_name) {
559             *SkStrAppendU32(buf+9, this->hash()) = '\0';
560             debug_name = buf;
561         }
562 
563         auto optimized = this->optimize(visualizer ? visualizer.get() : nullptr);
564         return {optimized,
565                 std::move(visualizer),
566                 fStrides,
567                 fTraceHooks, debug_name, allow_jit};
568     }
569 
hash() const570     uint64_t Builder::hash() const {
571         uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
572                  hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
573         return (uint64_t)lo | (uint64_t)hi << 32;
574     }
575 
operator !=(Ptr a,Ptr b)576     bool operator!=(Ptr a, Ptr b) { return a.ix != b.ix; }
577 
operator ==(const Instruction & a,const Instruction & b)578     bool operator==(const Instruction& a, const Instruction& b) {
579         return a.op   == b.op
580             && a.x    == b.x
581             && a.y    == b.y
582             && a.z    == b.z
583             && a.w    == b.w
584             && a.immA == b.immA
585             && a.immB == b.immB
586             && a.immC == b.immC;
587     }
588 
operator ()(const Instruction & inst,uint32_t seed) const589     uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
590         return SkOpts::hash(&inst, sizeof(inst), seed);
591     }
592 
593 
594     // Most instructions produce a value and return it by ID,
595     // the value-producing instruction's own index in the program vector.
push(Instruction inst)596     Val Builder::push(Instruction inst) {
597         // Basic common subexpression elimination:
598         // if we've already seen this exact Instruction, use it instead of creating a new one.
599         //
600         // But we never dedup loads or stores: an intervening store could change that memory.
601         // Uniforms and gathers touch only uniform memory, so they're fine to dedup,
602         // and index is varying but doesn't touch memory, so it's fine to dedup too.
603         if (!touches_varying_memory(inst.op) && !is_trace(inst.op)) {
604             if (Val* id = fIndex.find(inst)) {
605                 if (fCreateDuplicates) {
606                     inst.op = Op::duplicate;
607                     inst.immA = *id;
608                     fProgram.push_back(inst);
609                 }
610                 return *id;
611             }
612         }
613 
614         Val id = static_cast<Val>(fProgram.size());
615         fProgram.push_back(inst);
616         fIndex.set(inst, id);
617         return id;
618     }
619 
arg(int stride)620     Ptr Builder::arg(int stride) {
621         int ix = (int)fStrides.size();
622         fStrides.push_back(stride);
623         return {ix};
624     }
625 
assert_true(I32 cond,I32 debug)626     void Builder::assert_true(I32 cond, I32 debug) {
627     #ifdef SK_DEBUG
628         int imm;
629         if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
630         (void)push(Op::assert_true, cond.id, debug.id);
631     #endif
632     }
633 
attachTraceHook(TraceHook * hook)634     int Builder::attachTraceHook(TraceHook* hook) {
635         int traceHookID = (int)fTraceHooks.size();
636         fTraceHooks.push_back(hook);
637         return traceHookID;
638     }
639 
    // Fold the execution mask and trace mask together before emitting a trace op.
    // Returns false when either mask is a known all-zero constant, meaning the
    // trace op can never fire and should be skipped entirely.  When one mask is
    // a known all-on constant, both are collapsed onto the other so the emitted
    // op carries the one meaningful mask.  (The order of the checks matters:
    // the all-zero early-outs must run before the all-on collapses.)
    bool Builder::mergeMasks(I32& mask, I32& traceMask) {
        if (this->isImm(mask.id,      0)) { return false; }
        if (this->isImm(traceMask.id, 0)) { return false; }
        if (this->isImm(mask.id,     ~0)) { mask = traceMask; }
        if (this->isImm(traceMask.id,~0)) { traceMask = mask; }
        return true;
    }
647 
trace_line(int traceHookID,I32 mask,I32 traceMask,int line)648     void Builder::trace_line(int traceHookID, I32 mask, I32 traceMask, int line) {
649         SkASSERT(traceHookID >= 0);
650         SkASSERT(traceHookID < (int)fTraceHooks.size());
651         if (!this->mergeMasks(mask, traceMask)) { return; }
652         (void)push(Op::trace_line, mask.id,traceMask.id,NA,NA, traceHookID, line);
653     }
trace_var(int traceHookID,I32 mask,I32 traceMask,int slot,I32 val)654     void Builder::trace_var(int traceHookID, I32 mask, I32 traceMask, int slot, I32 val) {
655         SkASSERT(traceHookID >= 0);
656         SkASSERT(traceHookID < (int)fTraceHooks.size());
657         if (!this->mergeMasks(mask, traceMask)) { return; }
658         (void)push(Op::trace_var, mask.id,traceMask.id,val.id,NA, traceHookID, slot);
659     }
trace_enter(int traceHookID,I32 mask,I32 traceMask,int fnIdx)660     void Builder::trace_enter(int traceHookID, I32 mask, I32 traceMask, int fnIdx) {
661         SkASSERT(traceHookID >= 0);
662         SkASSERT(traceHookID < (int)fTraceHooks.size());
663         if (!this->mergeMasks(mask, traceMask)) { return; }
664         (void)push(Op::trace_enter, mask.id,traceMask.id,NA,NA, traceHookID, fnIdx);
665     }
trace_exit(int traceHookID,I32 mask,I32 traceMask,int fnIdx)666     void Builder::trace_exit(int traceHookID, I32 mask, I32 traceMask, int fnIdx) {
667         SkASSERT(traceHookID >= 0);
668         SkASSERT(traceHookID < (int)fTraceHooks.size());
669         if (!this->mergeMasks(mask, traceMask)) { return; }
670         (void)push(Op::trace_exit, mask.id,traceMask.id,NA,NA, traceHookID, fnIdx);
671     }
trace_scope(int traceHookID,I32 mask,I32 traceMask,int delta)672     void Builder::trace_scope(int traceHookID, I32 mask, I32 traceMask, int delta) {
673         SkASSERT(traceHookID >= 0);
674         SkASSERT(traceHookID < (int)fTraceHooks.size());
675         if (!this->mergeMasks(mask, traceMask)) { return; }
676         (void)push(Op::trace_scope, mask.id,traceMask.id,NA,NA, traceHookID, delta);
677     }
678 
store8(Ptr ptr,I32 val)679     void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); }
store16(Ptr ptr,I32 val)680     void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); }
store32(Ptr ptr,I32 val)681     void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); }
store64(Ptr ptr,I32 lo,I32 hi)682     void Builder::store64(Ptr ptr, I32 lo, I32 hi) {
683         (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix);
684     }
store128(Ptr ptr,I32 x,I32 y,I32 z,I32 w)685     void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) {
686         (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix);
687     }
688 
index()689     I32 Builder::index() { return {this, push(Op::index)}; }
690 
load8(Ptr ptr)691     I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; }
load16(Ptr ptr)692     I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; }
load32(Ptr ptr)693     I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; }
load64(Ptr ptr,int lane)694     I32 Builder::load64(Ptr ptr, int lane) {
695         return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) };
696     }
load128(Ptr ptr,int lane)697     I32 Builder::load128(Ptr ptr, int lane) {
698         return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) };
699     }
700 
gather8(UPtr ptr,int offset,I32 index)701     I32 Builder::gather8 (UPtr ptr, int offset, I32 index) {
702         return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)};
703     }
gather16(UPtr ptr,int offset,I32 index)704     I32 Builder::gather16(UPtr ptr, int offset, I32 index) {
705         return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)};
706     }
gather32(UPtr ptr,int offset,I32 index)707     I32 Builder::gather32(UPtr ptr, int offset, I32 index) {
708         return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)};
709     }
710 
uniform32(UPtr ptr,int offset)711     I32 Builder::uniform32(UPtr ptr, int offset) {
712         return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)};
713     }
714 
    // Loads element `index` of a uniform int array at `ptr` + `offset` bytes.
    // Note: this converts the array index into a byte offset for the op.
    // (The size_t product narrows implicitly to the op's int immediate.)
    I32 Builder::array32  (UPtr ptr, int offset, int index) {
        return {this, push(Op::array32, NA,NA,NA,NA, ptr.ix, offset, index * sizeof(int))};
    }
719 
splat(int n)720     I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; }
721 
722     template <typename F32_or_I32>
canonicalizeIdOrder(F32_or_I32 & x,F32_or_I32 & y)723     void Builder::canonicalizeIdOrder(F32_or_I32& x, F32_or_I32& y) {
724         bool immX = fProgram[x.id].op == Op::splat;
725         bool immY = fProgram[y.id].op == Op::splat;
726         if (immX != immY) {
727             if (immX) {
728                 // Prefer (val, imm) over (imm, val).
729                 std::swap(x, y);
730             }
731             return;
732         }
733         if (x.id > y.id) {
734             // Prefer (lower-ID, higher-ID) over (higher-ID, lower-ID).
735             std::swap(x, y);
736         }
737     }
738 
739     // Be careful peepholing float math!  Transformations you might expect to
740     // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
741     // Float peepholes must pass this equivalence test for all ~4B floats:
742     //
743     //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
744     //
745     //     unsigned bits = 0;
746     //     do {
747     //        float f;
748     //        memcpy(&f, &bits, 4);
749     //        if (!equiv(f, ...)) {
750     //           abort();
751     //        }
752     //     } while (++bits != 0);
753 
add(F32 x,F32 y)754     F32 Builder::add(F32 x, F32 y) {
755         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
756         this->canonicalizeIdOrder(x, y);
757         if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
758 
759         if (fFeatures.fma) {
760             if (fProgram[x.id].op == Op::mul_f32) {
761                 return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
762             }
763             if (fProgram[y.id].op == Op::mul_f32) {
764                 return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
765             }
766         }
767         return {this, this->push(Op::add_f32, x.id, y.id)};
768     }
769 
sub(F32 x,F32 y)770     F32 Builder::sub(F32 x, F32 y) {
771         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
772         if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
773         if (fFeatures.fma) {
774             if (fProgram[x.id].op == Op::mul_f32) {
775                 return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
776             }
777             if (fProgram[y.id].op == Op::mul_f32) {
778                 return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
779             }
780         }
781         return {this, this->push(Op::sub_f32, x.id, y.id)};
782     }
783 
mul(F32 x,F32 y)784     F32 Builder::mul(F32 x, F32 y) {
785         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
786         this->canonicalizeIdOrder(x, y);
787         if (this->isImm(y.id, 1.0f)) { return x; }  // x*1 == x
788         return {this, this->push(Op::mul_f32, x.id, y.id)};
789     }
790 
fast_mul(F32 x,F32 y)791     F32 Builder::fast_mul(F32 x, F32 y) {
792         if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); }
793         return mul(x,y);
794     }
795 
div(F32 x,F32 y)796     F32 Builder::div(F32 x, F32 y) {
797         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); }
798         if (this->isImm(y.id, 1.0f)) { return x; }  // x/1 == x
799         return {this, this->push(Op::div_f32, x.id, y.id)};
800     }
801 
sqrt(F32 x)802     F32 Builder::sqrt(F32 x) {
803         if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
804         return {this, this->push(Op::sqrt_f32, x.id)};
805     }
806 
    // Fast approximate log2(x) for x > 0 (not valid for zero/negative/NaN inputs).
    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
    F32 Builder::approx_log2(F32 x) {
        // e - 127 is a fair approximation of log2(x) in its own right...
        // (reading the float's bits as an int and scaling by 2^-23 recovers the
        // biased exponent plus a mantissa-derived fraction)
        F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23)));

        // ... but using the mantissa to refine its error is _much_ better.
        // Mask to the mantissa bits and force an exponent of 0x3f (i.e. m in [0.5,1)).
        F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff),
                                0x3f000000));
        F32 approx = sub(e,        124.225514990f);
            approx = sub(approx, mul(1.498030302f, m));
            approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));

        return approx;
    }
821 
approx_pow2(F32 x)822     F32 Builder::approx_pow2(F32 x) {
823         constexpr float kInfinityBits = 0x7f800000;
824 
825         F32 f = fract(x);
826         F32 approx = add(x,         121.274057500f);
827             approx = sub(approx, mul( 1.490129070f, f));
828             approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));
829             approx = mul(1.0f * (1<<23), approx);
830             approx = clamp(approx, 0, kInfinityBits);  // guard against underflow/overflow
831 
832         return pun_to_F32(round(approx));
833     }
834 
approx_powf(F32 x,F32 y)835     F32 Builder::approx_powf(F32 x, F32 y) {
836         // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
837         x = max(0.0f, x);
838 
839         if (this->isImm(x.id, 1.0f)) { return x; }                    // 1^y is one
840         if (this->isImm(x.id, 2.0f)) { return this->approx_pow2(y); } // 2^y is pow2(y)
841         if (this->isImm(y.id, 0.5f)) { return this->sqrt(x); }        // x^0.5 is sqrt(x)
842         if (this->isImm(y.id, 1.0f)) { return x; }                    // x^1 is x
843         if (this->isImm(y.id, 2.0f)) { return x * x; }                // x^2 is x*x
844 
845         auto is_x = bit_or(eq(x, 0.0f),
846                            eq(x, 1.0f));
847         return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
848     }
849 
    // Bhaskara I's sine approximation
    // 16x(pi - x) / (5*pi^2 - 4x(pi - x))
    // ... divide numerator and denominator by 4:
    // 4x(pi - x) / (5*pi^2/4 - x(pi - x))
    //
    // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
    // radians into that range first.
    //
    F32 Builder::approx_sin(F32 radians) {
        constexpr float Pi = SK_ScalarPI;
        // x = radians mod 2pi
        F32 x = fract(radians * (0.5f/Pi)) * (2*Pi);
        I32 neg = x > Pi;   // are we pi < x < 2pi --> need to negate result
        x = select(neg, x - Pi, x);

        F32 pair = x * (Pi - x);
        x = 4.0f * pair / ((5*Pi*Pi/4) - pair);
        x = select(neg, -x, x);
        return x;
    }
870 
    /*  "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
         https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf

        approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9

        Some simplifications:
        1. tan(x) is periodic, -PI/2 < x < PI/2
        2. tan(x) is odd, so tan(-x) = -tan(x)
        3. Our polynomial approximation is best near zero, so we use the following identity
                         tan(x) + tan(y)
           tan(x + y) = -----------------
                        1 - tan(x)*tan(y)
           tan(PI/4) = 1

           So for x > PI/8, we do the following refactor:
           x' = x - PI/4

                    1 + tan(x')
           tan(x) = ------------
                    1 - tan(x')
     */
    F32 Builder::approx_tan(F32 x) {
        constexpr float Pi = SK_ScalarPI;
        // periodic between -pi/2 ... pi/2
        // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
        x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2);

        // tan is odd: compute on |x| and restore the sign at the end.
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);

        // minimize total error by shifting if x > pi/8
        I32 use_quotient = (x > (Pi/8));
        x = select(use_quotient, x - (Pi/4), x);

        // 9th order poly = 4th order(x^2) * x
        x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x;
        x = select(use_quotient, (1+x)/(1-x), x);  // undo the pi/4 shift via identity 3 above
        x = select(neg, -x, x);
        return x;
    }
911 
     // Approximate asin(x) for x in [-1, 1].
     // http://mathforum.org/library/drmath/view/54137.html
     // referencing Handbook of Mathematical Functions,
     //             by Milton Abramowitz and Irene Stegun
     F32 Builder::approx_asin(F32 x) {
         // asin is odd: compute on |x| and restore the sign at the end.
         I32 neg = (x < 0.0f);
         x = select(neg, -x, x);
         x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f);
         x = select(neg, -x, x);
         return x;
     }
922 
    /*  Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
     *      with 129 values of x,atan(x) for x:[0...1]
     *  This only works for 0 <= x <= 1
     */
    static F32 approx_atan_unit(F32 x) {
        // for now we might be given NaN... let that through
        // (a NaN fails x != x's negation, so the assert's first clause admits it)
        x->assert_true((x != x) | ((x >= 0) & (x <= 1)));
        return poly(x, 0.14130025741326729f,
                      -0.34312835980675116f,
                      -0.016172900528248768f,
                       1.0037696976200385f,
                      -0.00014758242182738969f);
    }
936 
    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
     *  so approx_atan_unit() only ever sees arguments in [0, 1].
     */
    F32 Builder::approx_atan(F32 x) {
        // atan is odd: compute on |x| and restore the sign at the end.
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);
        I32 flip = (x > 1.0f);
        x = select(flip, 1/x, x);
        x = approx_atan_unit(x);
        x = select(flip, SK_ScalarPI/2 - x, x);
        x = select(neg, -x, x);
        return x;
    }
949 
    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
     *  By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
     *  which avoids a 2nd divide instruction if we had instead called atan().
     */
    F32 Builder::approx_atan2(F32 y0, F32 x0) {

        // Swap so |numerator| <= |denominator|, keeping the ratio within [-1, 1].
        I32 flip = (abs(y0) > abs(x0));
        F32 y = select(flip, x0, y0);
        F32 x = select(flip, y0, x0);
        F32 arg = y/x;

        // atan is odd: compute on |arg| and restore the sign afterwards.
        I32 neg = (arg < 0.0f);
        arg = select(neg, -arg, arg);

        F32 r = approx_atan_unit(arg);
        r = select(flip, SK_ScalarPI/2 - r, r);  // undo the swap: atan(x) = pi/2 - atan(1/x)
        r = select(neg, -r, r);

        // handle quadrant distinctions
        r = select((y0 >= 0) & (x0  < 0), r + SK_ScalarPI, r);
        r = select((y0  < 0) & (x0 <= 0), r - SK_ScalarPI, r);
        // Note: we don't try to handle 0,0 or infinities
        return r;
    }
974 
min(F32 x,F32 y)975     F32 Builder::min(F32 x, F32 y) {
976         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
977         return {this, this->push(Op::min_f32, x.id, y.id)};
978     }
max(F32 x,F32 y)979     F32 Builder::max(F32 x, F32 y) {
980         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
981         return {this, this->push(Op::max_f32, x.id, y.id)};
982     }
983 
    // Integer add/sub/mul deliberately wrap two's-complement; the sanitizer
    // suppressions keep instrumented builds from flagging the build-time folds.
    SK_NO_SANITIZE("signed-integer-overflow")
    I32 Builder::add(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        this->canonicalizeIdOrder(x, y);
        if (this->isImm(y.id, 0)) { return x; }  // x+0 == x
        return {this, this->push(Op::add_i32, x.id, y.id)};
    }
    SK_NO_SANITIZE("signed-integer-overflow")
    I32 Builder::sub(I32 x, I32 y) {
        // No canonicalizeIdOrder(): subtraction is not commutative.
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0)) { return x; }  // x-0 == x
        return {this, this->push(Op::sub_i32, x.id, y.id)};
    }
    SK_NO_SANITIZE("signed-integer-overflow")
    I32 Builder::mul(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        this->canonicalizeIdOrder(x, y);
        if (this->isImm(y.id, 0)) { return splat(0); }  // x*0 == 0 (safe for ints, unlike floats)
        if (this->isImm(y.id, 1)) { return x; }         // x*1 == x
        return {this, this->push(Op::mul_i32, x.id, y.id)};
    }
1005 
    // Shifts by a compile-time bit count; shifting by 0 is always a no-op.
    SK_NO_SANITIZE("shift")
    I32 Builder::shl(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
        return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)};
    }
    // Logical (zero-filling) right shift — note the unsigned cast in the fold.
    I32 Builder::shr(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
        return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)};
    }
    // Arithmetic (sign-extending) right shift.
    I32 Builder::sra(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
        return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)};
    }
1022 
eq(F32 x,F32 y)1023     I32 Builder:: eq(F32 x, F32 y) {
1024         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
1025         this->canonicalizeIdOrder(x, y);
1026         return {this, this->push(Op::eq_f32, x.id, y.id)};
1027     }
neq(F32 x,F32 y)1028     I32 Builder::neq(F32 x, F32 y) {
1029         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
1030         this->canonicalizeIdOrder(x, y);
1031         return {this, this->push(Op::neq_f32, x.id, y.id)};
1032     }
lt(F32 x,F32 y)1033     I32 Builder::lt(F32 x, F32 y) {
1034         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
1035         return {this, this->push(Op::gt_f32, y.id, x.id)};
1036     }
lte(F32 x,F32 y)1037     I32 Builder::lte(F32 x, F32 y) {
1038         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
1039         return {this, this->push(Op::gte_f32, y.id, x.id)};
1040     }
gt(F32 x,F32 y)1041     I32 Builder::gt(F32 x, F32 y) {
1042         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
1043         return {this, this->push(Op::gt_f32, x.id, y.id)};
1044     }
gte(F32 x,F32 y)1045     I32 Builder::gte(F32 x, F32 y) {
1046         if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
1047         return {this, this->push(Op::gte_f32, x.id, y.id)};
1048     }
1049 
eq(I32 x,I32 y)1050     I32 Builder:: eq(I32 x, I32 y) {
1051         if (x.id == y.id) { return splat(~0); }
1052         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
1053         this->canonicalizeIdOrder(x, y);
1054         return {this, this->push(Op:: eq_i32, x.id, y.id)};
1055     }
neq(I32 x,I32 y)1056     I32 Builder::neq(I32 x, I32 y) {
1057         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
1058         return ~(x == y);
1059     }
gt(I32 x,I32 y)1060     I32 Builder:: gt(I32 x, I32 y) {
1061         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
1062         return {this, this->push(Op:: gt_i32, x.id, y.id)};
1063     }
gte(I32 x,I32 y)1064     I32 Builder::gte(I32 x, I32 y) {
1065         if (x.id == y.id) { return splat(~0); }
1066         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
1067         return ~(x < y);
1068     }
lt(I32 x,I32 y)1069     I32 Builder:: lt(I32 x, I32 y) { return y>x; }
lte(I32 x,I32 y)1070     I32 Builder::lte(I32 x, I32 y) { return y>=x; }
1071 
holdsBitNot(Val id)1072     Val Builder::holdsBitNot(Val id) {
1073         // We represent `~x` as `x ^ ~0`.
1074         if (fProgram[id].op == Op::bit_xor && this->isImm(fProgram[id].y, ~0)) {
1075             return fProgram[id].x;
1076         }
1077         return NA;
1078     }
1079 
bit_and(I32 x,I32 y)1080     I32 Builder::bit_and(I32 x, I32 y) {
1081         if (x.id == y.id) { return x; }
1082         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
1083         this->canonicalizeIdOrder(x, y);
1084         if (this->isImm(y.id, 0)) { return splat(0); }         // (x & false) == false
1085         if (this->isImm(y.id,~0)) { return x; }                // (x & true) == x
1086         if (Val notX = this->holdsBitNot(x.id); notX != NA) {  // (~x & y) == bit_clear(y, ~x)
1087             return bit_clear(y, {this, notX});
1088         }
1089         if (Val notY = this->holdsBitNot(y.id); notY != NA) {  // (x & ~y) == bit_clear(x, ~y)
1090             return bit_clear(x, {this, notY});
1091         }
1092         return {this, this->push(Op::bit_and, x.id, y.id)};
1093     }
bit_or(I32 x,I32 y)1094     I32 Builder::bit_or(I32 x, I32 y) {
1095         if (x.id == y.id) { return x; }
1096         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
1097         this->canonicalizeIdOrder(x, y);
1098         if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
1099         if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
1100         return {this, this->push(Op::bit_or, x.id, y.id)};
1101     }
bit_xor(I32 x,I32 y)1102     I32 Builder::bit_xor(I32 x, I32 y) {
1103         if (x.id == y.id) { return splat(0); }
1104         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
1105         this->canonicalizeIdOrder(x, y);
1106         if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
1107         return {this, this->push(Op::bit_xor, x.id, y.id)};
1108     }
1109 
bit_clear(I32 x,I32 y)1110     I32 Builder::bit_clear(I32 x, I32 y) {
1111         if (x.id == y.id) { return splat(0); }
1112         if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
1113         if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
1114         if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
1115         if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
1116         return {this, this->push(Op::bit_clear, x.id, y.id)};
1117     }
1118 
select(I32 x,I32 y,I32 z)1119     I32 Builder::select(I32 x, I32 y, I32 z) {
1120         if (y.id == z.id) { return y; }
1121         if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
1122         if (this->isImm(x.id,~0)) { return y; }                // (true  ? y : z) == y
1123         if (this->isImm(x.id, 0)) { return z; }                // (false ? y : z) == z
1124         if (this->isImm(y.id, 0)) { return bit_clear(z,x); }   //     (x ? 0 : z) == ~x&z
1125         if (this->isImm(z.id, 0)) { return bit_and  (y,x); }   //     (x ? y : 0) ==  x&y
1126         if (Val notX = this->holdsBitNot(x.id); notX != NA) {  //    (!x ? y : z) == (x ? z : y)
1127             x.id = notX;
1128             std::swap(y, z);
1129         }
1130         return {this, this->push(Op::select, x.id, y.id, z.id)};
1131     }
1132 
extract(I32 x,int bits,I32 z)1133     I32 Builder::extract(I32 x, int bits, I32 z) {
1134         if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
1135         return bit_and(z, shr(x, bits));
1136     }
1137 
pack(I32 x,I32 y,int bits)1138     I32 Builder::pack(I32 x, I32 y, int bits) {
1139         return bit_or(x, shl(y, bits));
1140     }
1141 
ceil(F32 x)1142     F32 Builder::ceil(F32 x) {
1143         if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
1144         return {this, this->push(Op::ceil, x.id)};
1145     }
floor(F32 x)1146     F32 Builder::floor(F32 x) {
1147         if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
1148         return {this, this->push(Op::floor, x.id)};
1149     }
to_F32(I32 x)1150     F32 Builder::to_F32(I32 x) {
1151         if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
1152         return {this, this->push(Op::to_f32, x.id)};
1153     }
trunc(F32 x)1154     I32 Builder::trunc(F32 x) {
1155         if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
1156         return {this, this->push(Op::trunc, x.id)};
1157     }
round(F32 x)1158     I32 Builder::round(F32 x) {
1159         if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
1160         return {this, this->push(Op::round, x.id)};
1161     }
1162 
to_fp16(F32 x)1163     I32 Builder::to_fp16(F32 x) {
1164         if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
1165         return {this, this->push(Op::to_fp16, x.id)};
1166     }
from_fp16(I32 x)1167     F32 Builder::from_fp16(I32 x) {
1168         if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
1169         return {this, this->push(Op::from_fp16, x.id)};
1170     }
1171 
from_unorm(int bits,I32 x)1172     F32 Builder::from_unorm(int bits, I32 x) {
1173         F32 limit = splat(1 / ((1<<bits)-1.0f));
1174         return mul(to_F32(x), limit);
1175     }
to_unorm(int bits,F32 x)1176     I32 Builder::to_unorm(int bits, F32 x) {
1177         F32 limit = splat((1<<bits)-1.0f);
1178         return round(mul(x, limit));
1179     }
1180 
SkColorType_to_PixelFormat(SkColorType ct)1181     PixelFormat SkColorType_to_PixelFormat(SkColorType ct) {
1182         auto UNORM = PixelFormat::UNORM,
1183              SRGB  = PixelFormat::SRGB,
1184              FLOAT = PixelFormat::FLOAT,
1185              XRNG  = PixelFormat::XRNG;
1186         switch (ct) {
1187             case kUnknown_SkColorType: break;
1188 
1189             case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96};
1190 
1191             case kRGBA_F16Norm_SkColorType:       return {FLOAT,16,16,16,16, 0,16,32,48};
1192             case kRGBA_F16_SkColorType:           return {FLOAT,16,16,16,16, 0,16,32,48};
1193             case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48};
1194 
1195             case kA16_float_SkColorType:    return {FLOAT,  0, 0,0,16, 0, 0,0,0};
1196             case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0};
1197 
1198             case kAlpha_8_SkColorType:  return {UNORM, 0,0,0,8, 0,0,0,0};
1199             case kGray_8_SkColorType:   return {UNORM, 8,8,8,0, 0,0,0,0};  // Subtle.
1200             case kR8_unorm_SkColorType: return {UNORM, 8,0,0,0, 0,0,0,0};
1201 
1202             case kRGB_565_SkColorType:   return {UNORM, 5,6,5,0, 11,5,0,0};  // (BGR)
1203             case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0};  // (ABGR)
1204 
1205             case kRGBA_8888_SkColorType:  return {UNORM, 8,8,8,8,  0,8,16,24};
1206             case kRGB_888x_SkColorType:   return {UNORM, 8,8,8,0,  0,8,16,32};  // 32-bit
1207             case kBGRA_8888_SkColorType:  return {UNORM, 8,8,8,8, 16,8, 0,24};
1208             case kSRGBA_8888_SkColorType: return { SRGB, 8,8,8,8,  0,8,16,24};
1209 
1210             case kRGBA_1010102_SkColorType:   return {UNORM, 10,10,10,2,  0,10,20,30};
1211             case kBGRA_1010102_SkColorType:   return {UNORM, 10,10,10,2, 20,10, 0,30};
1212             case kRGB_101010x_SkColorType:    return {UNORM, 10,10,10,0,  0,10,20, 0};
1213             case kBGR_101010x_SkColorType:    return {UNORM, 10,10,10,0, 20,10, 0, 0};
1214             case kBGR_101010x_XR_SkColorType: return { XRNG, 10,10,10,0, 20,10, 0, 0};
1215 
1216             case kR8G8_unorm_SkColorType:   return {UNORM,  8, 8,0, 0, 0, 8,0,0};
1217             case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0};
1218             case kA16_unorm_SkColorType:    return {UNORM,  0, 0,0,16, 0, 0,0,0};
1219         }
1220         SkASSERT(false);
1221         return {UNORM, 0,0,0,0, 0,0,0,0};
1222     }
1223 
byte_size(PixelFormat f)1224     static int byte_size(PixelFormat f) {
1225         // What's the highest bit we read?
1226         int bits = std::max(f.r_bits + f.r_shift,
1227                    std::max(f.g_bits + f.g_shift,
1228                    std::max(f.b_bits + f.b_shift,
1229                             f.a_bits + f.a_shift)));
1230         // Round up to bytes.
1231         return (bits + 7) / 8;
1232     }
1233 
unpack(PixelFormat f,I32 x)1234     static Color unpack(PixelFormat f, I32 x) {
1235         SkASSERT(byte_size(f) <= 4);
1236 
1237         auto from_srgb = [](int bits, I32 channel) -> F32 {
1238             const skcms_TransferFunction* tf = skcms_sRGB_TransferFunction();
1239             F32 v = from_unorm(bits, channel);
1240             return sk_program_transfer_fn(v, skcms_TFType_sRGBish,
1241                                           v->splat(tf->g),
1242                                           v->splat(tf->a),
1243                                           v->splat(tf->b),
1244                                           v->splat(tf->c),
1245                                           v->splat(tf->d),
1246                                           v->splat(tf->e),
1247                                           v->splat(tf->f));
1248         };
1249         auto from_xr = [](int bits, I32 channel) -> F32 {
1250             static constexpr float min = -0.752941f;
1251             static constexpr float max = 1.25098f;
1252             static constexpr float range = max - min;
1253             F32 v = from_unorm(bits, channel);
1254             return v * range + min;
1255         };
1256 
1257         auto unpack_rgb = [=](int bits, int shift) -> F32 {
1258             I32 channel = extract(x, shift, (1<<bits)-1);
1259             switch (f.encoding) {
1260                 case PixelFormat::UNORM: return from_unorm(bits, channel);
1261                 case PixelFormat:: SRGB: return from_srgb (bits, channel);
1262                 case PixelFormat::FLOAT: return from_fp16 (      channel);
1263                 case PixelFormat:: XRNG: return from_xr   (bits, channel);
1264             }
1265             SkUNREACHABLE;
1266         };
1267         auto unpack_alpha = [=](int bits, int shift) -> F32 {
1268             I32 channel = extract(x, shift, (1<<bits)-1);
1269             switch (f.encoding) {
1270                 case PixelFormat::UNORM:
1271                 case PixelFormat:: SRGB: return from_unorm(bits, channel);
1272                 case PixelFormat::FLOAT: return from_fp16 (      channel);
1273                 case PixelFormat:: XRNG: return from_xr   (bits, channel);
1274             }
1275             SkUNREACHABLE;
1276         };
1277         return {
1278             f.r_bits ? unpack_rgb  (f.r_bits, f.r_shift) : x->splat(0.0f),
1279             f.g_bits ? unpack_rgb  (f.g_bits, f.g_shift) : x->splat(0.0f),
1280             f.b_bits ? unpack_rgb  (f.b_bits, f.b_shift) : x->splat(0.0f),
1281             f.a_bits ? unpack_alpha(f.a_bits, f.a_shift) : x->splat(1.0f),
1282         };
1283     }
1284 
split_disjoint_8byte_format(PixelFormat f,PixelFormat * lo,PixelFormat * hi)1285     static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
1286         SkASSERT(byte_size(f) == 8);
1287         // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
1288         // The assert on byte_size(lo) will trigger if this assumption is violated.
1289         *lo = f;
1290         if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
1291         if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
1292         if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
1293         if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
1294         SkASSERT(byte_size(*lo) == 4);
1295 
1296         *hi = f;
1297         if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
1298         if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
1299         if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
1300         if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
1301         SkASSERT(byte_size(*hi) == 4);
1302     }
1303 
    // The only 16-byte format we support today is RGBA F32,
    // though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
    static void assert_16byte_is_rgba_f32(PixelFormat f) {
    #if defined(SK_DEBUG)
        SkASSERT(byte_size(f) == 16);
        // Compare field-by-field against the canonical RGBA F32 format.
        PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);

        SkASSERT(f.encoding == rgba_f32.encoding);

        SkASSERT(f.r_bits == rgba_f32.r_bits);
        SkASSERT(f.g_bits == rgba_f32.g_bits);
        SkASSERT(f.b_bits == rgba_f32.b_bits);
        SkASSERT(f.a_bits == rgba_f32.a_bits);

        SkASSERT(f.r_shift == rgba_f32.r_shift);
        SkASSERT(f.g_shift == rgba_f32.g_shift);
        SkASSERT(f.b_shift == rgba_f32.b_shift);
        SkASSERT(f.a_shift == rgba_f32.a_shift);
    #endif
    }
1324 
    // Load one pixel of format f from ptr and unpack it into an r,g,b,a Color.
    Color Builder::load(PixelFormat f, Ptr ptr) {
        switch (byte_size(f)) {
            case 1: return unpack(f, load8 (ptr));
            case 2: return unpack(f, load16(ptr));
            case 4: return unpack(f, load32(ptr));
            case 8: {
                // Treat the 8-byte pixel as two disjoint 4-byte formats, then
                // take each channel from whichever half actually holds its bits.
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, load64(ptr, 0)),
                      h = unpack(hi, load64(ptr, 1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                // 16-byte pixels are RGBA F32 only: reinterpret each 32-bit lane as float.
                assert_16byte_is_rgba_f32(f);
                return {
                    pun_to_F32(load128(ptr, 0)),
                    pun_to_F32(load128(ptr, 1)),
                    pun_to_F32(load128(ptr, 2)),
                    pun_to_F32(load128(ptr, 3)),
                };
            }
            default: SkUNREACHABLE;
        }
    }
1354 
    // Like load(), but gathers pixels at ptr[offset + index] rather than contiguously.
    Color Builder::gather(PixelFormat f, UPtr ptr, int offset, I32 index) {
        switch (byte_size(f)) {
            case 1: return unpack(f, gather8 (ptr, offset, index));
            case 2: return unpack(f, gather16(ptr, offset, index));
            case 4: return unpack(f, gather32(ptr, offset, index));
            case 8: {
                // Gather the two 32-bit halves of each 8-byte pixel separately
                // (index<<1 scales the pixel index to a 32-bit word index).
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
                      h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                // RGBA F32: four float gathers per pixel (index<<2 scales to float index).
                assert_16byte_is_rgba_f32(f);
                return {
                    gatherF(ptr, offset, (index<<2)+0),
                    gatherF(ptr, offset, (index<<2)+1),
                    gatherF(ptr, offset, (index<<2)+2),
                    gatherF(ptr, offset, (index<<2)+3),
                };
            }
            default: SkUNREACHABLE;
        }
    }
1384 
    // Encode Color c into a packed pixel of format f (at most 32 bits wide).
    static I32 pack32(PixelFormat f, Color c) {
        SkASSERT(byte_size(f) <= 4);

        // Apply the inverse sRGB transfer function, then encode as unorm bits.
        auto to_srgb = [](int bits, F32 v) {
            const skcms_TransferFunction* tf = skcms_sRGB_Inverse_TransferFunction();
            return to_unorm(bits, sk_program_transfer_fn(v, skcms_TFType_sRGBish,
                                                         v->splat(tf->g),
                                                         v->splat(tf->a),
                                                         v->splat(tf->b),
                                                         v->splat(tf->c),
                                                         v->splat(tf->d),
                                                         v->splat(tf->e),
                                                         v->splat(tf->f)));
        };
        // Map the extended range [-0.752941, 1.25098] onto [0,1], then unorm-encode.
        auto to_xr = [](int bits, F32 v) {
            static constexpr float min = -0.752941f;
            static constexpr float max = 1.25098f;
            static constexpr float range = max - min;
            return to_unorm(bits, (v - min) * (1.0f / range));
        };

        I32 packed = c->splat(0);
        // Encode one color channel per f.encoding and OR it into packed at shift.
        auto pack_rgb = [&](F32 channel, int bits, int shift) {
            I32 encoded;
            switch (f.encoding) {
                case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
                case PixelFormat:: SRGB: encoded = to_srgb (bits, channel); break;
                case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
                case PixelFormat:: XRNG: encoded = to_xr   (bits, channel); break;
            }
            packed = pack(packed, encoded, shift);
        };
        // Alpha never gets the sRGB transfer function: SRGB alpha is plain unorm.
        auto pack_alpha = [&](F32 channel, int bits, int shift) {
            I32 encoded;
            switch (f.encoding) {
                case PixelFormat::UNORM:
                case PixelFormat:: SRGB: encoded = to_unorm(bits, channel); break;
                case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
                case PixelFormat:: XRNG: encoded = to_xr   (bits, channel); break;
            }
            packed = pack(packed, encoded, shift);
        };
        // Channels with zero bits are simply absent from the format.
        if (f.r_bits) { pack_rgb  (c.r, f.r_bits, f.r_shift); }
        if (f.g_bits) { pack_rgb  (c.g, f.g_bits, f.g_shift); }
        if (f.b_bits) { pack_rgb  (c.b, f.b_bits, f.b_shift); }
        if (f.a_bits) { pack_alpha(c.a, f.a_bits, f.a_shift); }
        return packed;
    }
1433 
    // Pack Color c into format f and store it at ptr.
    void Builder::store(PixelFormat f, Ptr ptr, Color c) {
        // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
        if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
            f.r_shift == f.g_shift && f.g_shift == f.b_shift) {

            // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
            c.r = c.r * 0.2126f
                + c.g * 0.7152f
                + c.b * 0.0722f;
            // Only the r channel (now holding luma) gets packed below.
            f.g_bits = f.b_bits = 0;
        }

        switch (byte_size(f)) {
            case 1: store8 (ptr, pack32(f,c)); break;
            case 2: store16(ptr, pack32(f,c)); break;
            case 4: store32(ptr, pack32(f,c)); break;
            case 8: {
                // Pack the two disjoint 4-byte halves independently.
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                store64(ptr, pack32(lo,c)
                           , pack32(hi,c));
                break;
            }
            case 16: {
                // RGBA F32: store each channel's float bit pattern directly.
                assert_16byte_is_rgba_f32(f);
                store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a));
                break;
            }
            default: SkUNREACHABLE;
        }
    }
1465 
unpremul(F32 * r,F32 * g,F32 * b,F32 a)1466     void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
1467         skvm::F32 invA = 1.0f / a,
1468                   inf  = pun_to_F32(splat(0x7f800000));
1469         // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
1470         invA = select(invA < inf, invA
1471                                 , 0.0f);
1472         *r *= invA;
1473         *g *= invA;
1474         *b *= invA;
1475     }
1476 
premul(F32 * r,F32 * g,F32 * b,F32 a)1477     void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
1478         *r *= a;
1479         *g *= a;
1480         *b *= a;
1481     }
1482 
uniformColor(SkColor4f color,Uniforms * uniforms)1483     Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) {
1484         auto [r,g,b,a] = color;
1485         return {
1486             uniformF(uniforms->pushF(r)),
1487             uniformF(uniforms->pushF(g)),
1488             uniformF(uniforms->pushF(b)),
1489             uniformF(uniforms->pushF(a)),
1490         };
1491     }
1492 
lerp(F32 lo,F32 hi,F32 t)1493     F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
1494         if (this->isImm(t.id, 0.0f)) { return lo; }
1495         if (this->isImm(t.id, 1.0f)) { return hi; }
1496         return mad(sub(hi, lo), t, lo);
1497     }
1498 
lerp(Color lo,Color hi,F32 t)1499     Color Builder::lerp(Color lo, Color hi, F32 t) {
1500         return {
1501             lerp(lo.r, hi.r, t),
1502             lerp(lo.g, hi.g, t),
1503             lerp(lo.b, hi.b, t),
1504             lerp(lo.a, hi.a, t),
1505         };
1506     }
1507 
    // Convert an RGBA Color to hue, saturation, lightness, alpha.
    HSLA Builder::to_hsla(Color c) {
        F32 mx = max(max(c.r,c.g),c.b),
            mn = min(min(c.r,c.g),c.b),
             d = mx - mn,
          invd = 1.0f / d,
        g_lt_b = select(c.g < c.b, splat(6.0f)
                                 , splat(0.0f));

        // Hue: the max channel picks the sextant; the red case adds 6 when g < b
        // so the difference wraps into a positive value before scaling by 1/6.
        F32 h = (1/6.0f) * select(mx == mn,  0.0f,
                           select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
                           select(mx == c.g, invd * (c.b - c.r) + 2.0f
                                           , invd * (c.r - c.g) + 4.0f)));

        // Lightness is the midpoint of the channel extremes; saturation is the
        // spread d normalized by whichever half of the lightness range we're in,
        // and 0 for pure gray (mx == mn).
        F32 sum = mx + mn,
              l = sum * 0.5f,
              s = select(mx == mn, 0.0f
                                 , d / select(l > 0.5f, 2.0f - sum
                                                      , sum));
        return {h, s, l, c.a};
    }
1528 
to_rgba(HSLA c)1529     Color Builder::to_rgba(HSLA c) {
1530         // See GrRGBToHSLFilterEffect.fp
1531 
1532         auto [h,s,l,a] = c;
1533         F32 x = s * (1.0f - abs(l + l - 1.0f));
1534 
1535         auto hue_to_rgb = [&,l=l](auto hue) {
1536             auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
1537             return x * (clamp01(q) - 0.5f) + l;
1538         };
1539 
1540         return {
1541             hue_to_rgb(h + 0/3.0f),
1542             hue_to_rgb(h + 2/3.0f),
1543             hue_to_rgb(h + 1/3.0f),
1544             c.a,
1545         };
1546     }
1547 
1548     // We're basing our implementation of non-separable blend modes on
1549     //   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1550     // and
1551     //   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1552     // They're equivalent, but ES' math has been better simplified.
1553     //
1554     // Anything extra we add beyond that is to make the math work with premul inputs.
1555 
saturation(skvm::F32 r,skvm::F32 g,skvm::F32 b)1556     static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1557         return max(r, max(g, b))
1558              - min(r, min(g, b));
1559     }
1560 
luminance(skvm::F32 r,skvm::F32 g,skvm::F32 b)1561     static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1562         return r*0.30f + g*0.59f + b*0.11f;
1563     }
1564 
set_sat(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 s)1565     static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
1566         F32 mn  = min(*r, min(*g, *b)),
1567             mx  = max(*r, max(*g, *b)),
1568             sat = mx - mn;
1569 
1570         // Map min channel to 0, max channel to s, and scale the middle proportionally.
1571         auto scale = [&](skvm::F32 c) {
1572             auto scaled = ((c - mn) * s) / sat;
1573             return select(is_finite(scaled), scaled, 0.0f);
1574         };
1575         *r = scale(*r);
1576         *g = scale(*g);
1577         *b = scale(*b);
1578     }
1579 
set_lum(skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 lu)1580     static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1581         auto diff = lu - luminance(*r, *g, *b);
1582         *r += diff;
1583         *g += diff;
1584         *b += diff;
1585     }
1586 
    // Clamp r,g,b back into the representable range while preserving luminance,
    // the "ClipColor" step of the non-separable blend spec cited above.
    static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
        F32 mn  = min(*r, min(*g, *b)),
            mx  = max(*r, max(*g, *b)),
            lu = luminance(*r, *g, *b);

        auto clip = [&](auto c) {
            // If any channel is negative, compress toward the luminance point.
            c = select(mn < 0 & lu != mn, lu + ((c-lu)*(  lu)) / (lu-mn), c);
            // If any channel exceeds alpha, compress the other direction.
            c = select(mx > a & lu != mx, lu + ((c-lu)*(a-lu)) / (mx-lu), c);
            return clamp01(c);  // May be a little negative, or worse, NaN.
        };
        *r = clip(*r);
        *g = clip(*g);
        *b = clip(*b);
    }
1601 
    // Blend premul src over premul dst according to mode, returning the premul result.
    Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
        // mma(x,y,z,w) == x*y + z*w, a recurring subexpression below.
        auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
            return x*y + z*w;
        };

        auto two = [](skvm::F32 x) { return x+x; };

        // Blend all four channels with the same two-operand function.
        auto apply_rgba = [&](auto fn) {
            return Color {
                fn(src.r, dst.r),
                fn(src.g, dst.g),
                fn(src.b, dst.b),
                fn(src.a, dst.a),
            };
        };

        // Blend r,g,b with fn; alpha always blends as srcover.
        auto apply_rgb_srcover_a = [&](auto fn) {
            return Color {
                fn(src.r, dst.r),
                fn(src.g, dst.g),
                fn(src.b, dst.b),
                mad(dst.a, 1-src.a, src.a),   // srcover for alpha
            };
        };

        // Shared tail of the non-separable (HSL) modes: add the src-outside-dst
        // and dst-outside-src terms to the blended interior; alpha is srcover.
        auto non_sep = [&](auto R, auto G, auto B) {
            return Color{
                R + mma(src.r, 1-dst.a,  dst.r, 1-src.a),
                G + mma(src.g, 1-dst.a,  dst.g, 1-src.a),
                B + mma(src.b, 1-dst.a,  dst.b, 1-src.a),
                mad(dst.a, 1-src.a, src.a),   // srcover for alpha
            };
        };

        switch (mode) {
            default:
                SkASSERT(false);
                [[fallthrough]]; /*but also, for safety, fallthrough*/

            case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };

            case SkBlendMode::kSrc: return src;
            case SkBlendMode::kDst: return dst;

            // Each Dst* mode below is its Src* counterpart with src and dst swapped.
            case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]];
            case SkBlendMode::kSrcOver:
                return apply_rgba([&](auto s, auto d) {
                    return mad(d,1-src.a, s);
                });

            case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]];
            case SkBlendMode::kSrcIn:
                return apply_rgba([&](auto s, auto d) {
                    return s * dst.a;
                });

            case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]];

            case SkBlendMode::kSrcOut:
                return apply_rgba([&](auto s, auto d) {
                    return s * (1-dst.a);
                });

            case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]];
            case SkBlendMode::kSrcATop:
                return apply_rgba([&](auto s, auto d) {
                    return mma(s, dst.a,  d, 1-src.a);
                });

            case SkBlendMode::kXor:
                return apply_rgba([&](auto s, auto d) {
                    return mma(s, 1-dst.a,  d, 1-src.a);
                });

            case SkBlendMode::kPlus:
                return apply_rgba([&](auto s, auto d) {
                    return min(s+d, 1.0f);
                });

            case SkBlendMode::kModulate:
                return apply_rgba([&](auto s, auto d) {
                    return s * d;
                });

            case SkBlendMode::kScreen:
                // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
                // It's kind of plausible that s + (d - sd) keeps more precision?
                return apply_rgba([&](auto s, auto d) {
                    return s + (d - s*d);
                });

            case SkBlendMode::kDarken:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - max(s * dst.a,
                                        d * src.a));
                });

            case SkBlendMode::kLighten:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - min(s * dst.a,
                                        d * src.a));
                });

            case SkBlendMode::kDifference:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - two(min(s * dst.a,
                                            d * src.a)));
                });

            case SkBlendMode::kExclusion:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - two(s * d));
                });

            case SkBlendMode::kColorBurn:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    // The division can produce non-finite lanes; the selects
                    // below substitute the appropriate edge-case formulas.
                    auto mn   = min(dst.a,
                                    src.a * (dst.a - d) / s),
                         burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
                    return select(d == dst.a     , s * (1-dst.a) + d,
                           select(is_finite(burn), burn
                                                 , d * (1-src.a) + s));
                });

            case SkBlendMode::kColorDodge:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    // As with ColorBurn, non-finite lanes fall back via select.
                    auto dodge = src.a * min(dst.a,
                                             d * src.a / (src.a - s))
                                       + mma(s, 1-dst.a, d, 1-src.a);
                    return select(d == 0.0f       , s * (1-dst.a) + d,
                           select(is_finite(dodge), dodge
                                                  , d * (1-src.a) + s));
                });

            case SkBlendMode::kHardLight:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return mma(s, 1-dst.a, d, 1-src.a) +
                           select(two(s) <= src.a,
                                  two(s * d),
                                  src.a * dst.a - two((dst.a - d) * (src.a - s)));
                });

            case SkBlendMode::kOverlay:
                // Overlay is HardLight with the select's test on dst instead of src.
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return mma(s, 1-dst.a, d, 1-src.a) +
                           select(two(d) <= dst.a,
                                  two(s * d),
                                  src.a * dst.a - two((dst.a - d) * (src.a - s)));
                });

            case SkBlendMode::kMultiply:
                return apply_rgba([&](auto s, auto d) {
                    return mma(s, 1-dst.a, d, 1-src.a) + s * d;
                });

            case SkBlendMode::kSoftLight:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    auto  m = select(dst.a > 0.0f, d / dst.a
                                                 , 0.0f),
                         s2 = two(s),
                         m4 = 4*m;

                         // The logic forks three ways:
                         //    1. dark src?
                         //    2. light src, dark dst?
                         //    3. light src, light dst?

                         // Used in case 1
                    auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
                         // Used in case 2
                         darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
                         // Used in case 3.
                         liteDst = sqrt(m) - m,
                         // Used in 2 or 3?
                         liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
                                                                             , liteDst)
                                   + d * src.a;
                    return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
                                                                             , liteSrc);
                });

            case SkBlendMode::kHue: {
                // Start from src's channels, then impose dst's saturation and luminance.
                skvm::F32 R = src.r * src.a,
                          G = src.g * src.a,
                          B = src.b * src.a;

                set_sat   (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
                set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
                clip_color(&R, &G, &B, src.a * dst.a);

                return non_sep(R, G, B);
            }

            case SkBlendMode::kSaturation: {
                // Start from dst's channels, then impose src's saturation and dst's luminance.
                skvm::F32 R = dst.r * src.a,
                          G = dst.g * src.a,
                          B = dst.b * src.a;

                set_sat   (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
                set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
                clip_color(&R, &G, &B, src.a * dst.a);

                return non_sep(R, G, B);
            }

            case SkBlendMode::kColor: {
                // Start from src's channels, then impose dst's luminance.
                skvm::F32 R = src.r * dst.a,
                          G = src.g * dst.a,
                          B = src.b * dst.a;

                set_lum   (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
                clip_color(&R, &G, &B, src.a * dst.a);

                return non_sep(R, G, B);
            }

            case SkBlendMode::kLuminosity: {
                // Start from dst's channels, then impose src's luminance.
                skvm::F32 R = dst.r * src.a,
                          G = dst.g * src.a,
                          B = dst.b * src.a;

                set_lum   (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
                clip_color(&R, &G, &B, dst.a * src.a);

                return non_sep(R, G, B);
            }
        }
    }
1830 
1831     // ~~~~ Program::eval() and co. ~~~~ //
1832 
1833     // Handy references for x86-64 instruction encoding:
1834     // https://wiki.osdev.org/X86-64_Instruction_Encoding
1835     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1836     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1837     // http://ref.x86asm.net/coder64.html
1838 
1839     // Used for ModRM / immediate instruction encoding.
_233(int a,int b,int c)1840     static uint8_t _233(int a, int b, int c) {
1841         return (a & 3) << 6
1842              | (b & 7) << 3
1843              | (c & 7) << 0;
1844     }
1845 
    // ModRM byte encodes the arguments of an opcode.
    enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
    // mod selects the addressing mode, reg the register (or opcode extension),
    // rm the register/memory operand.
    static uint8_t mod_rm(Mod mod, int reg, int rm) {
        return _233((int)mod, reg, rm);
    }
1851 
mod(int imm)1852     static Mod mod(int imm) {
1853         if (imm == 0)               { return Mod::Indirect; }
1854         if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1855         return Mod::FourByteImm;
1856     }
1857 
    // Number of displacement bytes that follow ModRM for a given addressing mode.
    static int imm_bytes(Mod mod) {
        switch (mod) {
            case Mod::Indirect:    return 0;
            case Mod::OneByteImm:  return 1;
            case Mod::FourByteImm: return 4;
            case Mod::Direct: SkUNREACHABLE;  // Direct mode carries no displacement.
        }
        SkUNREACHABLE;
    }
1867 
    // SIB byte encodes a memory address, base + (index * scale).
    // Same 2-3-3 bit layout as ModRM.
    static uint8_t sib(Assembler::Scale scale, int index, int base) {
        return _233((int)scale, index, base);
    }
1872 
1873     // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
rex(bool W,bool R,bool X,bool B)1874     static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
1875                        bool R,   // Extra top bit to select ModRM reg, registers 8-15.
1876                        bool X,   // Extra top bit for SIB index register.
1877                        bool B) { // Extra top bit for SIB base or ModRM rm register.
1878         return 0b01000000   // Fixed 0100 for top four bits.
1879              | (W << 3)
1880              | (R << 2)
1881              | (X << 1)
1882              | (B << 0);
1883     }
1884 
1885 
    // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
    struct VEX {
        int     len;       // Number of valid prefix bytes, 2 or 3.
        uint8_t bytes[3];
    };
1891 
    // Build a 2- or 3-byte VEX prefix from its component fields.
    static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
                   bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
                   bool   X,   // Same as REX X.
                   bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
                   int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
                   int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
                   bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
                   int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.

        // Pack x86 opcode map selector to 5-bit VEX encoding.
        map = [map]{
            switch (map) {
                case   0x0f: return 0b00001;
                case 0x380f: return 0b00010;
                case 0x3a0f: return 0b00011;
                // Several more cases only used by XOP / TBM.
            }
            SkUNREACHABLE;
        }();

        // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
        pp = [pp]{
            switch (pp) {
                case 0x66: return 0b01;
                case 0xf3: return 0b10;
                case 0xf2: return 0b11;
            }
            return 0b00;
        }();

        VEX vex = {0, {0,0,0}};
        if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
            // With these conditions met, we can optionally compress VEX to 2-byte.
            vex.len = 2;
            vex.bytes[0] = 0xc5;
            vex.bytes[1] = (pp      &  3) << 0
                         | (L       &  1) << 2
                         | (~vvvv   & 15) << 3   // vvvv and R are stored inverted.
                         | (~(int)R &  1) << 7;
        } else {
            // We could use this 3-byte VEX prefix all the time if we like.
            vex.len = 3;
            vex.bytes[0] = 0xc4;
            vex.bytes[1] = (map     & 31) << 0
                         | (~(int)B &  1) << 5   // B, X, R are stored inverted.
                         | (~(int)X &  1) << 6
                         | (~(int)R &  1) << 7;
            vex.bytes[2] = (pp    &  3) << 0
                         | (L     &  1) << 2
                         | (~vvvv & 15) << 3     // vvvv is stored inverted.
                         | (WE    &  1) << 7;
        }
        return vex;
    }
1946 
    // Pass buf == nullptr for a sizing pass that only counts bytes without writing them.
    Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {}

    // Number of bytes emitted (or counted) so far.
    size_t Assembler::size() const { return fSize; }
1950 
    // Append n raw bytes to the instruction stream.
    void Assembler::bytes(const void* p, int n) {
        // In a sizing pass fCode is null: skip the copy but still advance fSize.
        if (fCode) {
            memcpy(fCode+fSize, p, n);
        }
        fSize += n;
    }

    void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
    void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
1960 
align(int mod)1961     void Assembler::align(int mod) {
1962         while (this->size() % mod) {
1963             this->byte(0x00);
1964         }
1965     }
1966 
    void Assembler::int3() {
        this->byte(0xcc);  // int3: debug breakpoint trap.
    }

    void Assembler::vzeroupper() {
        // vzeroupper, encoded as C5 F8 77.
        this->byte(0xc5);
        this->byte(0xf8);
        this->byte(0x77);
    }
    void Assembler::ret() { this->byte(0xc3); }  // near return
1977 
    // Emit a 64-bit (REX.W) general-purpose instruction: REX prefix, 1- or 2-byte
    // opcode, ModRM, and for memory operands an optional SIB byte plus displacement.
    void Assembler::op(int opcode, Operand dst, GP64 x) {
        if (dst.kind == Operand::REG) {
            this->byte(rex(W1,x>>3,0,dst.reg>>3));
            this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
            this->byte(mod_rm(Mod::Direct, x, dst.reg&7));
        } else {
            SkASSERT(dst.kind == Operand::MEM);
            const Mem& m = dst.mem;
            // rsp as a base always requires a SIB byte, as does any explicit index.
            const bool need_SIB = (m.base&7) == rsp
                               || m.index != rsp;

            this->byte(rex(W1,x>>3,m.index>>3,m.base>>3));
            this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
            this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7));
            if (need_SIB) {
                this->byte(sib(m.scale, m.index&7, m.base&7));
            }
            // 0, 1, or 4 displacement bytes, matching the Mod chosen above.
            this->bytes(&m.disp, imm_bytes(mod(m.disp)));
        }
    }
1998 
    // Emit an instruction taking an immediate, with opcode_ext carried in the
    // ModRM reg field (the "/digit" opcode extension).
    void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) {
        opcode |= 0b1000'0000;   // top bit set for instructions with any immediate

        int imm_bytes = 4;
        if (SkTFitsIn<int8_t>(imm)) {
            imm_bytes = 1;
            opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
        }

        this->op(opcode, dst, (GP64)opcode_ext);
        this->bytes(&imm, imm_bytes);
    }
2011 
    // Immediate arithmetic/compare: base 0x01 becomes 0x81/0x83 via op() above,
    // with the operation chosen by the ModRM /ext field (add=/0, sub=/5, cmp=/7).
    void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); }
    void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); }
    void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); }
2015 
    // These don't work quite like the other instructions with immediates:
    // these immediates are always fixed size at 4 bytes or 1 byte.
    // 0xC7 /0 stores a 32-bit immediate; 0xC6 /0 stores a single byte.
    void Assembler::mov(Operand dst, int imm) {
        this->op(0xC7,dst,(GP64)0b000);
        this->word(imm);
    }
    void Assembler::movb(Operand dst, int imm) {
        this->op(0xC6,dst,(GP64)0b000);
        this->byte(imm);
    }
2026 
    // Register-to-r/m forms (reg is the source, r/m the destination).
    void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); }
    void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); }
    void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); }
    void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); }
    void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); }

    // r/m-to-register forms (r/m is the source, reg the destination).
    void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); }
    void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); }
    void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); }
    void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); }
    void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); }
2038 
    // Zero-extending loads: two-byte opcodes, written low byte first by op(),
    // so 0xB60F emits 0F B6 (movzx r64, r/m8) and 0xB70F emits 0F B7 (r/m16).
    void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); }
    void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); }
2041 
    // AVX2 packed integer arithmetic on ymm registers.  Each (prefix, map,
    // opcode) triple matches Intel's VEX encoding tables; y may be reg or mem.
    void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
    void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
    void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); }

    // 16-bit lane variants.
    void Assembler::vpaddw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfd, dst,x,y); }
    void Assembler::vpsubw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xf9, dst,x,y); }
    void Assembler::vpmullw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xd5, dst,x,y); }
    void Assembler::vpavgw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xe3, dst,x,y); }
    void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); }
    void Assembler::vpminsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xea, dst,x,y); }
    void Assembler::vpmaxsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xee, dst,x,y); }
    void Assembler::vpminuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); }
    void Assembler::vpmaxuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); }
2055 
    // Absolute value of 16-bit lanes (unary: only dst and a source operand).
    void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); }


    // Bitwise logical ops on full ymm registers.
    void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
    void Assembler::vpor  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
    void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); }
    void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); }
2063 
    // Packed single-precision float arithmetic (no mandatory prefix, map 0x0f).
    void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); }
    void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); }
    void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); }
    void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); }
    void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); }
    void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); }
2070 
    // Fused multiply-add/sub.  The 132/213/231 suffixes name which of the three
    // operands are multiplied vs. added, per Intel's FMA mnemonic scheme.
    void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); }
    void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
    void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

    void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
    void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
    void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); }

    void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
    void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); }
    void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); }
2082 
    // Saturating packs (narrowing), unpacks (interleave), and integer compares.
    void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
    void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0x67, dst,x,y); }

    void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); }
    void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }

    void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
    void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); }
    void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
    void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); }
2093 
2094 
    void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
        // When we've embedded a label displacement in the middle of an instruction,
        // we need to tweak it a little so that the resolved displacement starts
        // from the end of the instruction and not the end of the displacement.
        if (operand.kind == Operand::LABEL && fCode) {
            // The 4-byte displacement just written ends at fCode+fSize; the imm
            // byte about to be appended pushes the instruction's end out by one,
            // so shrink the stored displacement by one to compensate.
            int disp;
            memcpy(&disp, fCode+fSize-4, 4);
            disp--;
            memcpy(fCode+fSize-4, &disp, 4);
        }
        this->byte(imm);
    }
2107 
    // Packed float compare; imm selects the comparison predicate.
    void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) {
        this->op(0,0x0f,0xc2, dst,x,y);
        this->imm_byte_after_operand(y, imm);
    }

    // Byte blend; the mask register z rides in the top 4 bits of the imm byte.
    void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) {
        this->op(0x66,0x3a0f,0x4c, dst,x,y);
        this->imm_byte_after_operand(y, z << 4);
    }
2117 
    // Shift instructions encode their opcode extension as "dst", dst as x, and x as y.
    // The (Ymm)6 / (Ymm)2 / (Ymm)4 casts are really ModRM opcode extensions:
    // /6 = shift left, /2 = logical shift right, /4 = arithmetic shift right.
    void Assembler::vpslld(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)6, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrld(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)2, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrad(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
        this->byte(imm);
    }
    // 16-bit lane shifts use opcode 0x71 with the same extensions.
    void Assembler::vpsllw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)6, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsraw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)4, dst,x);
        this->byte(imm);
    }
2143 
    void Assembler::vpermq(Ymm dst, Operand x, int imm) {
        // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
        this->op(0x66,0x3a0f,0x00, dst,x,W1);
        this->imm_byte_after_operand(x, imm);
    }

    // imm selects how the two 128-bit halves of x and y are combined.
    void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x06, dst,x,y);
        this->imm_byte_after_operand(y, imm);
    }

    // Permute 32-bit lanes of src by the indices held in ix.
    void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) {
        this->op(0x66,0x380f,0x16, dst,ix,src);
    }

    // Round packed floats; imm is the rounding-mode selector.
    void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) {
        this->op(0x66,0x3a0f,0x08, dst,x);
        this->imm_byte_after_operand(x, imm);
    }
2163 
    // Vector moves: opcode 0x10 is the load form, 0x11 the store form.
    // The register operand always goes in the reg slot, so stores pass (src,dst).
    void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); }
    void Assembler::vmovups(Ymm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
    void Assembler::vmovups(Xmm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
    void Assembler::vmovups(Operand dst, Ymm src) { this->op(   0,0x0f,0x11, src,dst); }
    void Assembler::vmovups(Operand dst, Xmm src) { this->op(   0,0x0f,0x11, src,dst); }
2169 
    // int<->float conversions share opcode 0x5b, distinguished only by prefix.
    void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op(   0,0x0f,0x5b, dst,x); }
    void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); }
    void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); }
    void Assembler::vsqrtps   (Ymm dst, Operand x) { this->op(   0,0x0f,0x51, dst,x); }

    // float -> half: store-like form, so x goes in the reg slot and dst in r/m;
    // imm carries the rounding mode.
    void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) {
        this->op(0x66,0x3a0f,0x1d, x,dst);
        this->imm_byte_after_operand(dst, imm);
    }
    // half -> float.
    void Assembler::vcvtph2ps(Ymm dst, Operand x) {
        this->op(0x66,0x380f,0x13, dst,x);
    }
2182 
    // Compute a displacement to label l and record this site in l->references
    // so it can be patched once the label is finally placed (see label()).
    // The value returned is provisional if the label isn't set yet.
    int Assembler::disp19(Label* l) {
        SkASSERT(l->kind == Label::NotYetSet ||
                 l->kind == Label::ARMDisp19);
        int here = (int)this->size();
        l->kind = Label::ARMDisp19;
        l->references.push_back(here);
        // ARM 19-bit instruction count, from the beginning of this instruction.
        return (l->offset - here) / 4;
    }

    int Assembler::disp32(Label* l) {
        SkASSERT(l->kind == Label::NotYetSet ||
                 l->kind == Label::X86Disp32);
        int here = (int)this->size();
        l->kind = Label::X86Disp32;
        l->references.push_back(here);
        // x86 32-bit byte count, from the end of this instruction.
        return l->offset - (here + 4);
    }
2202 
op(int prefix,int map,int opcode,int dst,int x,Operand y,W w,L l)2203     void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) {
2204         switch (y.kind) {
2205             case Operand::REG: {
2206                 VEX v = vex(w, dst>>3, 0, y.reg>>3,
2207                             map, x, l, prefix);
2208                 this->bytes(v.bytes, v.len);
2209                 this->byte(opcode);
2210                 this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7));
2211             } return;
2212 
2213             case Operand::MEM: {
2214                 // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows;
2215                 // without an SIB byte, that's where the base register would usually go.
2216                 // This means we have to use an SIB byte if we want to use rsp as a base register.
2217                 const Mem& m = y.mem;
2218                 const bool need_SIB = m.base  == rsp
2219                                    || m.index != rsp;
2220 
2221                 VEX v = vex(w, dst>>3, m.index>>3, m.base>>3,
2222                             map, x, l, prefix);
2223                 this->bytes(v.bytes, v.len);
2224                 this->byte(opcode);
2225                 this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7));
2226                 if (need_SIB) {
2227                     this->byte(sib(m.scale, m.index&7, m.base&7));
2228                 }
2229                 this->bytes(&m.disp, imm_bytes(mod(m.disp)));
2230             } return;
2231 
2232             case Operand::LABEL: {
2233                 // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
2234                 const int rip = rbp;
2235 
2236                 VEX v = vex(w, dst>>3, 0, rip>>3,
2237                             map, x, l, prefix);
2238                 this->bytes(v.bytes, v.len);
2239                 this->byte(opcode);
2240                 this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
2241                 this->word(this->disp32(y.label));
2242             } return;
2243         }
2244     }
2245 
    // Byte shuffle of x by control bytes in y.
    void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); }

    // Logical test: sets flags from x AND y (no destination register).
    void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); }

    // Broadcast one 32-bit scalar from y into every lane of dst.
    void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); }
2251 
    // condition is the second opcode byte of the near form (0x84 = je, etc.).
    void Assembler::jump(uint8_t condition, Label* l) {
        // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
        //    7?     one-byte-disp
        //    0F 8? four-byte-disp
        // We always use the near displacement to make updating labels simpler (no resizing).
        this->byte(0x0f);
        this->byte(condition);
        this->word(this->disp32(l));
    }
    void Assembler::je (Label* l) { this->jump(0x84, l); }
    void Assembler::jne(Label* l) { this->jump(0x85, l); }
    void Assembler::jl (Label* l) { this->jump(0x8c, l); }
    void Assembler::jc (Label* l) { this->jump(0x82, l); }

    // Unconditional jump (0xE9 rel32).
    void Assembler::jmp(Label* l) {
        // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
        this->byte(0xe9);
        this->word(this->disp32(l));
    }
2271 
    // Zero-extending vector loads: words->dwords and bytes->dwords.
    void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); }
    void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); }

    // Scalar stores of the low 64 / 32 bits of an xmm (store forms: src in reg slot).
    void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); }

    void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); }
    void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); }
2279 
    // Lane inserts: place a dword/word/byte from y into lane `imm` of src,
    // writing the result to dst.  The imm byte follows the operand, so use
    // imm_byte_after_operand() in case y was a label displacement.
    void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x22, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
    void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x0f,0xc4, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
    void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x20, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
2292 
    // Extracts: store lane (or 128-bit half) `imm` of src to dst.  dst may be a
    // register or memory but never a label (asserted), so a plain byte() is safe
    // here -- no displacement fix-up via imm_byte_after_operand() is needed.
    void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
        this->op(0x66,0x3a0f,0x39, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x16, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x15, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrb(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x14, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
2313 
    // Gather 32-bit floats from base + ix[i]*scale under mask, into dst.
    void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
        // Unlike most instructions, no aliasing is permitted here.
        SkASSERT(dst != ix);
        SkASSERT(dst != mask);
        SkASSERT(mask != ix);

        // Gathers don't fit the usual op() shapes: the mask register rides in
        // the VEX "x" slot and the address is always SIB-encoded.
        int prefix = 0x66,
            map    = 0x380f,
            opcode = 0x92;
        VEX v = vex(0, dst>>3, ix>>3, base>>3,
                    map, mask, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/));
        this->byte(sib(scale, ix&7, base&7));
    }
2330 
2331     // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
2332 
mask(unsigned long long bits)2333     static int mask(unsigned long long bits) { return (1<<(int)bits)-1; }
2334 
    // Assemble one 32-bit A64 instruction from bit fields:
    // hi[31:21] | m[20:16] | lo[15:10] | n[9:5] | d[4:0].
    void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
        this->word( (hi & mask(11)) << 21
                  | (m  & mask(5)) << 16
                  | (lo & mask(6)) << 10
                  | (n  & mask(5)) <<  5
                  | (d  & mask(5)) <<  0);
    }
    // Variant taking bits [31:10] as one chunk; imm is OR'd in at whatever
    // position the caller pre-shifted it to.
    void Assembler::op(uint32_t op22, V n, V d, int imm) {
        this->word( (op22 & mask(22)) << 10
                  | imm  // size and location depends on the instruction
                  | (n    & mask(5)) <<  5
                  | (d    & mask(5)) <<  0);
    }
2348 
    // NEON (AdvSIMD) ops on the full 128-bit vector.  The binary literals are
    // the instruction bit patterns from the Arm ARM, with ' separators at field
    // boundaries.  Suffixes name the lane layout: 16b = 16x8-bit, 8h = 8x16-bit,
    // 4s = 4x32-bit.
    void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
    void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::not16b(V d, V n)      { this->op(0b0'1'1'01110'00'10000'00101'10,  n, d); }

    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

    void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
    void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }

    void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
    void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }

    // Packed single-precision float arithmetic.
    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
    void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
    void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }

    void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); }
    void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); }

    void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }

    // Fused multiply-add / multiply-subtract (accumulate into d).
    void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
    void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }

    // Table lookup: bytes of m index into n.
    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }

    // Unzip (de-interleave) and zip (interleave) of 32-bit lanes.
    void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); }
    void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); }
    void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); }
    void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); }
2389 
    // Immediate vector shifts.  Left shifts store the amount directly; right
    // shifts store its negation, per the A64 immh:immb shift-amount encoding
    // (see the Arm ARM for the exact field layout).
    void Assembler::sli4s(V d, V n, int imm5) {
        this->op(0b0'1'1'011110'0100'000'01010'1,    n, d, ( imm5 & mask(5))<<16);
    }
    void Assembler::shl4s(V d, V n, int imm5) {
        this->op(0b0'1'0'011110'0100'000'01010'1,    n, d, ( imm5 & mask(5))<<16);
    }
    void Assembler::sshr4s(V d, V n, int imm5) {
        this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & mask(5))<<16);
    }
    void Assembler::ushr4s(V d, V n, int imm5) {
        this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & mask(5))<<16);
    }
    void Assembler::ushr8h(V d, V n, int imm4) {
        this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & mask(4))<<16);
    }
2405 
    // int<->float conversions, float rounding, narrowing/widening moves,
    // and an unsigned horizontal minimum.
    void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
    void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
    void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
    void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); }
    void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); }

    // float precision narrowing (f32->f16) and widening (f16->f32).
    void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); }
    void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); }

    // Integer narrowing (xtn) and unsigned widening (uxtl).
    void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
    void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }

    void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
    void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }

    void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
2422 
    // BRK: breakpoint instruction carrying a 16-bit immediate payload.
    void Assembler::brk(int imm16) {
        this->op(0b11010100'001'00000000000, (imm16 & mask(16)) << 5);
    }

    // RET: return via the address in register n.
    void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); }
2428 
    // add/sub with a 12-bit unsigned immediate; subs is the flag-setting form.
    void Assembler::add(X d, X n, int imm12) {
        this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & mask(12)) << 10);
    }
    void Assembler::sub(X d, X n, int imm12) {
        this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & mask(12)) << 10);
    }
    void Assembler::subs(X d, X n, int imm12) {
        this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & mask(12)) << 10);
    }
2438 
    // add (shifted register): d = n + (m shifted by imm6).  ROR is not a valid
    // shift type for this encoding, hence the assert.
    void Assembler::add(X d, X n, X m, Shift shift, int imm6) {
        SkASSERT(shift != ROR);

        // Pack m, the shift type, and the shift amount into the imm slot.
        int imm = (imm6  & mask(6)) << 0
                | (m     & mask(5)) << 6
                | (0     & mask(1)) << 11
                | (shift & mask(2)) << 12;
        this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10);
    }
2448 
    // PC-relative branches with a 19-bit instruction-count displacement:
    // b.cond, cbz (branch if register is zero), cbnz (branch if non-zero).
    void Assembler::b(Condition cond, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & mask(19)) << 5);
    }
    void Assembler::cbz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & mask(19)) << 5);
    }
    void Assembler::cbnz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & mask(19)) << 5);
    }
2461 
    // Unsigned-offset loads.  The first set targets GP (X) registers, the second
    // the SIMD&FP (V) file; the q/d/s/h/b suffixes give the transfer width.
    // imm12 is the A64 unsigned immediate offset field.
    void Assembler::ldrd(X dst, X src, int imm12) {
        this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10);
    }
    void Assembler::ldrs(X dst, X src, int imm12) {
        this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10);
    }
    void Assembler::ldrh(X dst, X src, int imm12) {
        this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10);
    }
    void Assembler::ldrb(X dst, X src, int imm12) {
        this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10);
    }

    void Assembler::ldrq(V dst, X src, int imm12) {
        this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & mask(12)) << 10);
    }
    void Assembler::ldrd(V dst, X src, int imm12) {
        this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10);
    }
    void Assembler::ldrs(V dst, X src, int imm12) {
        this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10);
    }
    void Assembler::ldrh(V dst, X src, int imm12) {
        this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10);
    }
    void Assembler::ldrb(V dst, X src, int imm12) {
        this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & mask(12)) << 10);
    }
2490 
    // Unsigned-offset stores, mirroring the loads above (opc field flipped).
    void Assembler::strs(X src, X dst, int imm12) {
        this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10);
    }

    void Assembler::strq(V src, X dst, int imm12) {
        this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & mask(12)) << 10);
    }
    void Assembler::strd(V src, X dst, int imm12) {
        this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10);
    }
    void Assembler::strs(V src, X dst, int imm12) {
        this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10);
    }
    void Assembler::strh(V src, X dst, int imm12) {
        this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10);
    }
    void Assembler::strb(V src, X dst, int imm12) {
        this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & mask(12)) << 10);
    }
2510 
    // Move a 32-bit lane between a vector and a GP register.  In the A64 imm5
    // element encoding the low 0b100 marks a 32-bit ('s') element, with the
    // lane index packed in the bits above it.
    void Assembler::movs(X dst, V src, int lane) {
        int imm5 = (lane << 3) | 0b100;
        this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & mask(5)) << 16);
    }
    // Insert GP register src into lane `lane` of vector dst.
    void Assembler::inss(V dst, X src, int lane) {
        int imm5 = (lane << 3) | 0b100;
        this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & mask(5)) << 16);
    }
2519 
2520 
    // LDR (literal, SIMD&FP): PC-relative 128-bit load of the constant at Label l into V dst.
    // disp19() records the 19-bit instruction-count displacement, patched later by label().
    void Assembler::ldrq(V dst, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & mask(19)) << 5);
    }
2525 
    // DUP Vd.4S, Wn: broadcast the 32-bit GP register src into all four lanes of V dst.
    void Assembler::dup4s(V dst, X src) {
        this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst);
    }
2529 
    // LD1R family: load one element from [src] and replicate it to every lane of V dst.
    // The low two bits select element size: 10 = 32-bit (4S), 01 = 16-bit (8H), 00 = 8-bit (16B).
    void Assembler::ld1r4s(V dst, X src) {
        this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst);
    }
    void Assembler::ld1r8h(V dst, X src) {
        this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst);
    }
    void Assembler::ld1r16b(V dst, X src) {
        this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst);
    }
2539 
    // LD2/LD4 and ST2/ST4 (multiple structures, .4s): de-interleaving loads / interleaving stores
    // of 2 or 4 consecutive V registers starting at dst/src.  One bit flips load vs store,
    // one field flips the 2- vs 4-register form.
    void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); }
    void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); }
    void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); }
    void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); }
2544 
    // LD2/LD4 (single structure): load one 32-bit element into lane `lane` of 2 (or 4)
    // consecutive V registers.  The A64 encoding splits the lane index across the Q bit
    // (bit 30) and the S bit (bit 12), hence the unpacking below.
    void Assembler::ld24s(V dst, X src, int lane) {
        int Q = (lane & 2)>>1,
            S = (lane & 1);
                 /*  Q                       S */
        this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12));
    }
    void Assembler::ld44s(V dst, X src, int lane) {
        int Q = (lane & 2)>>1,
            S = (lane & 1);
        this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12));
    }
2556 
    // Bind Label l to the current position, back-patching every instruction that
    // referenced it.  References recorded before this point all encoded a displacement
    // relative to the label's previous offset, so each is adjusted by `delta`.
    void Assembler::label(Label* l) {
        if (fCode) {
            // The instructions all currently point to l->offset.
            // We'll want to add a delta to point them to here.
            int here = (int)this->size();
            int delta = here - l->offset;
            l->offset = here;

            if (l->kind == Label::ARMDisp19) {
                for (int ref : l->references) {
                    // ref points to a 32-bit instruction with 19-bit displacement in instructions.
                    uint32_t inst;
                    memcpy(&inst, fCode + ref, 4);

                    // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
                    // Shift left 8 to drop the preserved high bits, then arithmetic shift right 13
                    // to sign-extend the 19-bit field into a full int.
                    int disp = (int)(inst << 8) >> 13;

                    disp += delta/4;  // delta is in bytes, we want instructions.

                    // Put it all back together, preserving the high 8 bits and low 5.
                    inst = ((disp << 5) &  (mask(19) << 5))
                         | ((inst     ) & ~(mask(19) << 5));
                    memcpy(fCode + ref, &inst, 4);
                }
            }

            if (l->kind == Label::X86Disp32) {
                for (int ref : l->references) {
                    // ref points to a 32-bit displacement in bytes.
                    int disp;
                    memcpy(&disp, fCode + ref, 4);

                    disp += delta;

                    memcpy(fCode + ref, &disp, 4);
                }
            }
        }
    }
2596 
    // Run the program over n lanes with one pointer per argument in args.
    // Dispatches to the JIT'd entry point when one exists (and JIT is enabled),
    // otherwise falls back to the portable interpreter.
    void Program::eval(int n, void* args[]) const {
    #define SKVM_JIT_STATS 0
    #if SKVM_JIT_STATS
        static std::atomic<int64_t>  calls{0}, jits{0},
                                    pixels{0}, fast{0};
        pixels += n;
        if (0 == calls++) {
            atexit([]{
                int64_t num = jits .load(),
                        den = calls.load();
                SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den);
                num = fast  .load();
                den = pixels.load();
                SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den);
            });
        }
    #endif

    #if !defined(SKVM_JIT_BUT_IGNORE_IT)
        const void* jit_entry = fImpl->jit_entry.load();
        // jit_entry may be null if we can't JIT
        //
        // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it
        // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off,
        // due to timing or program caching.
        if (jit_entry != nullptr && gSkVMAllowJIT) {
        #if SKVM_JIT_STATS
            jits++;
            fast += n;
        #endif
            // The JIT entry point's true signature depends on how many pointer args the
            // program takes, so cast to the matching function-pointer type before calling.
            void** a = args;
            switch (fImpl->strides.size()) {
                case 0: return ((void(*)(int                        ))jit_entry)(n               );
                case 1: return ((void(*)(int,void*                  ))jit_entry)(n,a[0]          );
                case 2: return ((void(*)(int,void*,void*            ))jit_entry)(n,a[0],a[1]     );
                case 3: return ((void(*)(int,void*,void*,void*      ))jit_entry)(n,a[0],a[1],a[2]);
                case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3]);
                case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4]);
                case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4],a[5]);
                case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]);
                default: break; //SkASSERT(fImpl->strides.size() <= 7);
            }
        }
    #endif

        // So we'll sometimes use the interpreter here even if later calls will use the JIT.
        SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
                               this->nregs(), this->loop(), fImpl->strides.data(),
                               fImpl->traceHooks.data(), fImpl->traceHooks.size(),
                               this->nargs(), n, args);
    }
2652 
hasTraceHooks() const2653     bool Program::hasTraceHooks() const {
2654         // Identifies a program which has been instrumented for debugging.
2655         return !fImpl->traceHooks.empty();
2656     }
2657 
hasJIT() const2658     bool Program::hasJIT() const {
2659         return fImpl->jit_entry.load() != nullptr;
2660     }
2661 
    // Release any JIT artifacts (dylib or executable buffer) and reset the
    // program to interpreter-only state.  Safe to call when nothing was JIT'd.
    void Program::dropJIT() {
    #if defined(SKVM_JIT)
        if (fImpl->dylib) {
            // Code was loaded via a dylib; closing it unmaps the code too.
            close_dylib(fImpl->dylib);
        } else if (auto jit_entry = fImpl->jit_entry.load()) {
            // Code lives in a buffer we mapped ourselves.
            unmap_jit_buffer(jit_entry, fImpl->jit_size);
        }
    #else
        SkASSERT(!this->hasJIT());
    #endif

        fImpl->jit_entry.store(nullptr);
        fImpl->jit_size  = 0;
        fImpl->dylib     = nullptr;
    }
2677 
Program()2678     Program::Program() : fImpl(std::make_unique<Impl>()) {}
2679 
~Program()2680     Program::~Program() {
2681         // Moved-from Programs may have fImpl == nullptr.
2682         if (fImpl) {
2683             this->dropJIT();
2684         }
2685     }
2686 
Program(Program && other)2687     Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}
2688 
    // Move-assign: replaces our Impl with other's, leaving other with fImpl == nullptr.
    // NOTE(review): no self-move guard; self-move-assignment would leave this Program
    // in a valid-but-unspecified state per unique_ptr semantics — presumably never done.
    Program& Program::operator=(Program&& other) {
        fImpl = std::move(other.fImpl);
        return *this;
    }
2693 
    // Build a runnable Program from optimized instructions.  Always produces the
    // interpreter form; additionally JITs when both the global switch (gSkVMAllowJIT)
    // and the caller (allow_jit) permit it and a JIT backend is compiled in.
    Program::Program(const std::vector<OptimizedInstruction>& instructions,
                     std::unique_ptr<viz::Visualizer> visualizer,
                     const std::vector<int>& strides,
                     const std::vector<TraceHook*>& traceHooks,
                     const char* debug_name, bool allow_jit) : Program() {
        fImpl->visualizer = std::move(visualizer);
        fImpl->strides = strides;
        fImpl->traceHooks = traceHooks;
        if (gSkVMAllowJIT && allow_jit) {
        #if defined(SKVM_JIT)
            this->setupJIT(instructions, debug_name);
        #endif
        }

        // Interpreter instructions are always built, as the JIT fallback path in eval().
        this->setupInterpreter(instructions);
    }
2710 
    // Simple accessors.  instructions() returns a copy of the interpreter program;
    // nargs() is the number of pointer arguments eval() expects; nregs() the number of
    // virtual registers the interpreter needs; loop() the index of the first
    // non-hoisted (per-iteration) instruction.
    std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
    int  Program::nargs() const { return (int)fImpl->strides.size(); }
    int  Program::nregs() const { return fImpl->regs; }
    int  Program::loop () const { return fImpl->loop; }
    bool Program::empty() const { return fImpl->instructions.empty(); }
2716 
    // Translate OptimizedInstructions to InterpreterInstructions.
    void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
        // Register each instruction is assigned to.
        std::vector<Reg> reg(instructions.size());

        // This next bit is a bit more complicated than strictly necessary;
        // we could just assign every instruction to its own register.
        //
        // But recycling registers is fairly cheap, and good practice for the
        // JITs where minimizing register pressure really is important.
        //
        // We have effectively infinite registers, so we hoist any value we can.
        // (The JIT may choose a more complex policy to reduce register pressure.)

        fImpl->regs = 0;
        std::vector<Reg> avail;   // Stack of registers free for reuse.

        // Assign this value to a register, recycling them where we can.
        auto assign_register = [&](Val id) {
            const OptimizedInstruction& inst = instructions[id];

            // If this is a real input and its lifetime ends at this instruction,
            // we can recycle the register it's occupying.
            auto maybe_recycle_register = [&](Val input) {
                if (input != NA && instructions[input].death == id) {
                    avail.push_back(reg[input]);
                }
            };

            // Take care to not recycle the same register twice.
            const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w;
            if (true                      ) { maybe_recycle_register(x); }
            if (y != x                    ) { maybe_recycle_register(y); }
            if (z != x && z != y          ) { maybe_recycle_register(z); }
            if (w != x && w != y && w != z) { maybe_recycle_register(w); }

            // Instructions that die at themselves (stores) don't need a register.
            if (inst.death != id) {
                // Allocate a register if we have to, preferring to reuse anything available.
                if (avail.empty()) {
                    reg[id] = fImpl->regs++;
                } else {
                    reg[id] = avail.back();
                    avail.pop_back();
                }
            }
        };

        // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if ( instructions[id].can_hoist) { assign_register(id); }
        }
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (!instructions[id].can_hoist) { assign_register(id); }
        }

        // Translate OptimizedInstructions to InterpreterInstructions by mapping values to
        // registers.  This will be two passes, first hoisted instructions, then inside the loop.

        // The loop begins at the fImpl->loop'th Instruction.
        fImpl->loop = 0;
        fImpl->instructions.reserve(instructions.size());

        // Add a mapping for the N/A sentinel Val to any arbitrary register
        // so lookups don't have to know which arguments are used by which Ops.
        auto lookup_register = [&](Val id) {
            return id == NA ? (Reg)0
                            : reg[id];
        };

        // Emit one InterpreterInstruction for Val id from its optimized form.
        auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
            InterpreterInstruction pinst{
                inst.op,
                lookup_register(id),
                lookup_register(inst.x),
                lookup_register(inst.y),
                lookup_register(inst.z),
                lookup_register(inst.w),
                inst.immA,
                inst.immB,
                inst.immC,
            };
            fImpl->instructions.push_back(pinst);
        };

        // Pass 1: hoisted (loop-invariant) instructions; each bumps fImpl->loop so the
        // interpreter knows where per-iteration work begins.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (inst.can_hoist) {
                push_instruction(id, inst);
                fImpl->loop++;
            }
        }
        // Pass 2: the per-iteration loop body.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (!inst.can_hoist) {
                push_instruction(id, inst);
            }
        }
    }
2816 
2817 #if defined(SKVM_JIT)
2818 
    // Per-architecture alias for the vector register type the JIT allocates:
    // Ymm (256-bit AVX2) on x86-64, V (128-bit NEON) on arm64.
    namespace SkVMJitTypes {
    #if defined(__x86_64__) || defined(_M_X64)
        using Reg = Assembler::Ymm;
    #elif defined(__aarch64__)
        using Reg = Assembler::V;
    #endif
    }  // namespace SkVMJitTypes
2826 
jit(const std::vector<OptimizedInstruction> & instructions,int * stack_hint,uint32_t * registers_used,Assembler * a) const2827     bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
2828                       int* stack_hint,
2829                       uint32_t* registers_used,
2830                       Assembler* a) const {
2831         using A = Assembler;
2832         using SkVMJitTypes::Reg;
2833 
2834         SkTHashMap<int, A::Label> constants;    // Constants (mostly splats) share the same pool.
2835         A::Label                  iota;         // Varies per lane, for Op::index.
2836         A::Label                  load64_index; // Used to load low or high half of 64-bit lanes.
2837 
2838         // The `regs` array tracks everything we know about each register's state:
2839         //   - NA:   empty
2840         //   - RES:  reserved by ABI
2841         //   - TMP:  holding a temporary
2842         //   - id:   holding Val id
2843         constexpr Val RES = NA-1,
2844                       TMP = RES-1;
2845 
2846         // Map val -> stack slot.
2847         std::vector<int> stack_slot(instructions.size(), NA);
2848         int next_stack_slot = 0;
2849 
2850         const int nstack_slots = *stack_hint >= 0 ? *stack_hint
2851                                                   : stack_slot.size();
2852     #if defined(__x86_64__) || defined(_M_X64)
2853         if (!SkCpu::Supports(SkCpu::HSW)) {
2854             return false;
2855         }
2856         const int K = 8;
2857         #if defined(_M_X64)  // Important to check this first; clang-cl defines both.
2858             const A::GP64 N = A::rcx,
2859                         GP0 = A::rax,
2860                         GP1 = A::r11,
2861                         arg[]    = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi };
2862 
2863             // xmm6-15 need are callee-saved.
2864             std::array<Val,16> regs = {
2865                  NA, NA, NA, NA,  NA, NA,RES,RES,
2866                 RES,RES,RES,RES, RES,RES,RES,RES,
2867             };
2868             const uint32_t incoming_registers_used = *registers_used;
2869 
2870             auto enter = [&]{
2871                 // rcx,rdx,r8,r9 are all already holding their correct values.
2872                 // Load caller-saved r10 from rsp+40 if there's a fourth arg.
2873                 if (fImpl->strides.size() >= 4) {
2874                     a->mov(A::r10, A::Mem{A::rsp, 40});
2875                 }
2876                 // Load callee-saved rdi from rsp+48 if there's a fifth arg,
2877                 // first saving it to ABI reserved shadow area rsp+8.
2878                 if (fImpl->strides.size() >= 5) {
2879                     a->mov(A::Mem{A::rsp, 8}, A::rdi);
2880                     a->mov(A::rdi, A::Mem{A::rsp, 48});
2881                 }
2882                 // Load callee-saved rsi from rsp+56 if there's a sixth arg,
2883                 // first saving it to ABI reserved shadow area rsp+16.
2884                 if (fImpl->strides.size() >= 6) {
2885                     a->mov(A::Mem{A::rsp, 16}, A::rsi);
2886                     a->mov(A::rsi, A::Mem{A::rsp, 56});
2887                 }
2888 
2889                 // Allocate stack for our values and callee-saved xmm6-15.
2890                 int stack_needed = nstack_slots*K*4;
2891                 for (int r = 6; r < 16; r++) {
2892                     if (incoming_registers_used & (1<<r)) {
2893                         stack_needed += 16;
2894                     }
2895                 }
2896                 if (stack_needed) { a->sub(A::rsp, stack_needed); }
2897 
2898                 int next_saved_xmm = nstack_slots*K*4;
2899                 for (int r = 6; r < 16; r++) {
2900                     if (incoming_registers_used & (1<<r)) {
2901                         a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r);
2902                         next_saved_xmm += 16;
2903                         regs[r] = NA;
2904                     }
2905                 }
2906             };
2907             auto exit  = [&]{
2908                 // The second pass of jit() shouldn't use any register it didn't in the first pass.
2909                 SkASSERT((*registers_used & incoming_registers_used) == *registers_used);
2910 
2911                 // Restore callee-saved xmm6-15 and the stack pointer.
2912                 int stack_used = nstack_slots*K*4;
2913                 for (int r = 6; r < 16; r++) {
2914                     if (incoming_registers_used & (1<<r)) {
2915                         a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used});
2916                         stack_used += 16;
2917                     }
2918                 }
2919                 if (stack_used) { a->add(A::rsp, stack_used); }
2920 
2921                 // Restore callee-saved rdi/rsi if we used them.
2922                 if (fImpl->strides.size() >= 5) {
2923                     a->mov(A::rdi, A::Mem{A::rsp, 8});
2924                 }
2925                 if (fImpl->strides.size() >= 6) {
2926                     a->mov(A::rsi, A::Mem{A::rsp, 16});
2927                 }
2928 
2929                 a->vzeroupper();
2930                 a->ret();
2931             };
2932         #elif defined(__x86_64__)
2933             const A::GP64 N = A::rdi,
2934                         GP0 = A::rax,
2935                         GP1 = A::r11,
2936                         arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 };
2937 
2938             // All 16 ymm registers are available to use.
2939             std::array<Val,16> regs = {
2940                 NA,NA,NA,NA, NA,NA,NA,NA,
2941                 NA,NA,NA,NA, NA,NA,NA,NA,
2942             };
2943 
2944             auto enter = [&]{
2945                 // Load caller-saved r10 from rsp+8 if there's a sixth arg.
2946                 if (fImpl->strides.size() >= 6) {
2947                     a->mov(A::r10, A::Mem{A::rsp, 8});
2948                 }
2949                 if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }
2950             };
2951             auto exit  = [&]{
2952                 if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
2953                 a->vzeroupper();
2954                 a->ret();
2955             };
2956         #endif
2957 
2958         auto load_from_memory = [&](Reg r, Val v) {
2959             if (instructions[v].op == Op::splat) {
2960                 if (instructions[v].immA == 0) {
2961                     a->vpxor(r,r,r);
2962                 } else {
2963                     a->vmovups(r, constants.find(instructions[v].immA));
2964                 }
2965             } else {
2966                 SkASSERT(stack_slot[v] != NA);
2967                 a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4});
2968             }
2969         };
2970         auto store_to_stack = [&](Reg r, Val v) {
2971             SkASSERT(next_stack_slot < nstack_slots);
2972             stack_slot[v] = next_stack_slot++;
2973             a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r);
2974         };
2975     #elif defined(__aarch64__)
2976         const int K = 4;
2977         const A::X N     = A::x0,
2978                    GP0   = A::x8,
2979                    GP1   = A::x9,
2980                    arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
2981 
2982         // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
2983         std::array<Val,32> regs = {
2984              NA, NA, NA, NA,  NA, NA, NA, NA,
2985             RES,RES,RES,RES, RES,RES,RES,RES,
2986              NA, NA, NA, NA,  NA, NA, NA, NA,
2987              NA, NA, NA, NA,  NA, NA, NA, NA,
2988         };
2989 
2990         auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
2991         auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
2992                           a->ret(A::x30); };
2993 
2994         auto load_from_memory = [&](Reg r, Val v) {
2995             if (instructions[v].op == Op::splat) {
2996                 if (instructions[v].immA == 0) {
2997                     a->eor16b(r,r,r);
2998                 } else {
2999                     a->ldrq(r, constants.find(instructions[v].immA));
3000                 }
3001             } else {
3002                 SkASSERT(stack_slot[v] != NA);
3003                 a->ldrq(r, A::sp, stack_slot[v]);
3004             }
3005         };
3006         auto store_to_stack  = [&](Reg r, Val v) {
3007             SkASSERT(next_stack_slot < nstack_slots);
3008             stack_slot[v] = next_stack_slot++;
3009             a->strq(r, A::sp, stack_slot[v]);
3010         };
3011     #endif
3012 
3013         *registers_used = 0;  // We'll update this as we go.
3014 
3015         if (std::size(arg) < fImpl->strides.size()) {
3016             return false;
3017         }
3018 
3019         auto emit = [&](Val id, bool scalar) {
3020             const int active_lanes = scalar ? 1 : K;
3021             const OptimizedInstruction& inst = instructions[id];
3022             const Op op = inst.op;
3023             const Val x = inst.x,
3024                       y = inst.y,
3025                       z = inst.z,
3026                       w = inst.w;
3027             const int immA = inst.immA,
3028                       immB = inst.immB,
3029                       immC = inst.immC;
3030 
3031             // alloc_tmp() returns the first of N adjacent temporary registers,
3032             // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst().
3033             auto alloc_tmp = [&](int N=1) -> Reg {
3034                 auto needs_spill = [&](Val v) -> bool {
3035                     SkASSERT(v >= 0);   // {NA,TMP,RES} need to be handled before calling this.
3036                     return stack_slot[v] == NA               // We haven't spilled it already?
3037                         && instructions[v].op != Op::splat;  // No need to spill constants.
3038                 };
3039 
3040                 // We want to find a block of N adjacent registers requiring the fewest spills.
3041                 int best_block = -1,
3042                     min_spills = 0x7fff'ffff;
3043                 for (int block = 0; block+N <= (int)regs.size(); block++) {
3044                     int spills = 0;
3045                     for (int r = block; r < block+N; r++) {
3046                         Val v = regs[r];
3047                         // Registers holding NA (nothing) are ideal, nothing to spill.
3048                         if (v == NA) {
3049                             continue;
3050                         }
3051                         // We can't spill anything REServed or that we'll need this instruction.
3052                         if (v == RES ||
3053                             v == TMP || v == id || v == x || v == y || v == z || v == w) {
3054                             spills = 0x7fff'ffff;
3055                             block  = r;   // (optimization) continue outer loop at next register.
3056                             break;
3057                         }
3058                         // Usually here we've got a value v that we'd have to spill to the stack
3059                         // before reusing its register, but sometimes even now we get a freebie.
3060                         spills += needs_spill(v) ? 1 : 0;
3061                     }
3062 
3063                     // TODO: non-arbitrary tie-breaking?
3064                     if (min_spills > spills) {
3065                         min_spills = spills;
3066                         best_block = block;
3067                     }
3068                     if (min_spills == 0) {
3069                         break;  // (optimization) stop early if we find an unbeatable block.
3070                     }
3071                 }
3072 
3073                 // TODO: our search's success isn't obviously guaranteed... it depends on N
3074                 // and the number and relative position in regs of any unspillable values.
3075                 // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64;
3076                 // we'll need to revisit this logic should this assert fire.
3077                 SkASSERT(min_spills <= N);
3078 
3079                 // Spill what needs spilling, and mark the block all as TMP.
3080                 for (int r = best_block; r < best_block+N; r++) {
3081                     Val& v = regs[r];
3082                     *registers_used |= (1<<r);
3083 
3084                     SkASSERT(v == NA || v >= 0);
3085                     if (v >= 0 && needs_spill(v)) {
3086                         store_to_stack((Reg)r, v);
3087                         SkASSERT(!needs_spill(v));
3088                         min_spills--;
3089                     }
3090 
3091                     v = TMP;
3092                 }
3093                 SkASSERT(min_spills == 0);
3094                 return (Reg)best_block;
3095             };
3096 
3097             auto free_tmp = [&](Reg r) {
3098                 SkASSERT(regs[r] == TMP);
3099                 regs[r] = NA;
3100             };
3101 
3102             // Which register holds dst,x,y,z,w for this instruction?  NA if none does yet.
3103             int rd = NA,
3104                 rx = NA,
3105                 ry = NA,
3106                 rz = NA,
3107                 rw = NA;
3108 
3109             auto update_regs = [&](Reg r, Val v) {
3110                 if (v == id) { rd = r; }
3111                 if (v ==  x) { rx = r; }
3112                 if (v ==  y) { ry = r; }
3113                 if (v ==  z) { rz = r; }
3114                 if (v ==  w) { rw = r; }
3115                 return r;
3116             };
3117 
3118             auto find_existing_reg = [&](Val v) -> int {
3119                 // Quick-check our working registers.
3120                 if (v == id && rd != NA) { return rd; }
3121                 if (v ==  x && rx != NA) { return rx; }
3122                 if (v ==  y && ry != NA) { return ry; }
3123                 if (v ==  z && rz != NA) { return rz; }
3124                 if (v ==  w && rw != NA) { return rw; }
3125 
3126                 // Search inter-instruction register map.
3127                 for (auto [r,val] : SkMakeEnumerate(regs)) {
3128                     if (val == v) {
3129                         return update_regs((Reg)r, v);
3130                     }
3131                 }
3132                 return NA;
3133             };
3134 
3135             // Return a register for Val, holding that value if it already exists.
3136             // During this instruction all calls to r(v) will return the same register.
3137             auto r = [&](Val v) -> Reg {
3138                 SkASSERT(v >= 0);
3139 
3140                 if (int found = find_existing_reg(v); found != NA) {
3141                     return (Reg)found;
3142                 }
3143 
3144                 Reg r = alloc_tmp();
3145                 SkASSERT(regs[r] == TMP);
3146 
3147                 SkASSERT(v <= id);
3148                 if (v < id) {
3149                     // If v < id, we're loading one of this instruction's inputs.
3150                     // If v == id we're just allocating its destination register.
3151                     load_from_memory(r, v);
3152                 }
3153                 regs[r] = v;
3154                 return update_regs(r, v);
3155             };
3156 
3157             auto dies_here = [&](Val v) -> bool {
3158                 SkASSERT(v >= 0);
3159                 return instructions[v].death == id;
3160             };
3161 
3162             // Alias dst() to r(v) if dies_here(v).
3163             auto try_alias = [&](Val v) -> bool {
3164                 SkASSERT(v == x || v == y || v == z || v == w);
3165                 if (dies_here(v)) {
3166                     rd = r(v);      // Vals v and id share a register for this instruction.
3167                     regs[rd] = id;  // Next instruction, Val id will be in the register, not Val v.
3168                     return true;
3169                 }
3170                 return false;
3171             };
3172 
3173             // Generally r(id),
3174             // but with a hint, try to alias dst() to r(v) if dies_here(v).
3175             auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg {
3176                 if (hint1 != NA && try_alias(hint1)) { return r(id); }
3177                 if (hint2 != NA && try_alias(hint2)) { return r(id); }
3178                 return r(id);
3179             };
3180 
3181         #if defined(__aarch64__)  // Nothing sneaky, just unused on x86-64.
3182             auto mark_tmp_as_dst = [&](Reg tmp) {
3183                 SkASSERT(regs[tmp] == TMP);
3184                 rd = tmp;
3185                 regs[rd] = id;
3186                 SkASSERT(dst() == tmp);
3187             };
3188         #endif
3189 
        #if defined(__x86_64__) || defined(_M_X64)
            // On x86 we can work with many values directly from the stack or program constant pool.
            // Returns Val v as a general Operand: its live register if it has one,
            // its constant-pool slot if it's a splat, or its stack spill slot.
            auto any = [&](Val v) -> A::Operand {
                SkASSERT(v >= 0);
                SkASSERT(v < id);

                if (int found = find_existing_reg(v); found != NA) {
                    return (Reg)found;
                }
                if (instructions[v].op == Op::splat) {
                    return constants.find(instructions[v].immA);
                }
                return A::Mem{A::rsp, stack_slot[v]*K*4};
            };

            // This is never really worth asking except when any() might be used;
            // if we need this value in ARM, might as well just call r(v) to get it into a register.
            auto in_reg = [&](Val v) -> bool {
                return find_existing_reg(v) != NA;
            };
        #endif
3211 
            switch (op) {
                // Make sure splat constants can be found by load_from_memory() or any().
                case Op::splat:
                    (void)constants[immA];
                    break;

            #if defined(__x86_64__) || defined(_M_X64)
                // vptest sets CF iff every bit of the 0xffffffff mask is set in r(x),
                // i.e. every lane is all-true; jc then skips the int3() debug trap.
                case Op::assert_true: {
                    a->vptest (r(x), &constants[0xffffffff]);
                    A::Label all_true;
                    a->jc(&all_true);
                    a->int3();
                    a->label(&all_true);
                } break;

                case Op::trace_line:
                case Op::trace_var:
                case Op::trace_enter:
                case Op::trace_exit:
                case Op::trace_scope:
                    /* Force this program to run in the interpreter. */
                    return false;

                // store8/store16: pack i32 lanes down with unsigned saturation.  AVX2
                // pack instructions operate within each 128-bit half, so vpermq 0xd8
                // swaps the middle two 64-bit lanes to restore linear order first.
                case Op::store8:
                    if (scalar) {
                        a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
                    } else {
                        a->vpackusdw(dst(x), r(x), r(x));
                        a->vpermq   (dst(), dst(), 0xd8);
                        a->vpackuswb(dst(), dst(), dst());
                        a->vmovq    (A::Mem{arg[immA]}, (A::Xmm)dst());
                    } break;

                case Op::store16:
                    if (scalar) {
                        a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
                    } else {
                        a->vpackusdw(dst(x), r(x), r(x));
                        a->vpermq   (dst(), dst(), 0xd8);
                        a->vmovups  (A::Mem{arg[immA]}, (A::Xmm)dst());
                    } break;

                case Op::store32: if (scalar) { a->vmovd  (A::Mem{arg[immA]}, (A::Xmm)r(x)); }
                                  else        { a->vmovups(A::Mem{arg[immA]},         r(x)); }
                                  break;

                case Op::store64: if (scalar) {
                                      a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x));
                                      a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y));
                                  } else {
                                      // r(x) = {a,b,c,d|e,f,g,h}
                                      // r(y) = {i,j,k,l|m,n,o,p}
                                      // We want to write a,i,b,j,c,k,d,l,e,m...
                                      A::Ymm L = alloc_tmp(),
                                             H = alloc_tmp();
                                      a->vpunpckldq(L, r(x), any(y));  // L = {a,i,b,j|e,m,f,n}
                                      a->vpunpckhdq(H, r(x), any(y));  // H = {c,k,d,l|g,o,h,p}
                                      a->vperm2f128(dst(), L,H, 0x20); //   = {a,i,b,j|c,k,d,l}
                                      a->vmovups(A::Mem{arg[immA], 0}, dst());
                                      a->vperm2f128(dst(), L,H, 0x31); //   = {e,m,f,n|g,o,h,p}
                                      a->vmovups(A::Mem{arg[immA],32}, dst());
                                      free_tmp(L);
                                      free_tmp(H);
                                  } break;

                // store128: interleave the four 32-bit components lane by lane using
                // vmovd/vpextrd scatter stores (no wide interleaved store is used here).
                case Op::store128: {
                    // TODO: >32-bit stores
                    a->vmovd  (A::Mem{arg[immA], 0*16 +  0}, (A::Xmm)r(x)   );
                    a->vmovd  (A::Mem{arg[immA], 0*16 +  4}, (A::Xmm)r(y)   );
                    a->vmovd  (A::Mem{arg[immA], 0*16 +  8}, (A::Xmm)r(z)   );
                    a->vmovd  (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w)   );
                    if (scalar) { break; }

                    a->vpextrd(A::Mem{arg[immA], 1*16 +  0}, (A::Xmm)r(x), 1);
                    a->vpextrd(A::Mem{arg[immA], 1*16 +  4}, (A::Xmm)r(y), 1);
                    a->vpextrd(A::Mem{arg[immA], 1*16 +  8}, (A::Xmm)r(z), 1);
                    a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1);

                    a->vpextrd(A::Mem{arg[immA], 2*16 +  0}, (A::Xmm)r(x), 2);
                    a->vpextrd(A::Mem{arg[immA], 2*16 +  4}, (A::Xmm)r(y), 2);
                    a->vpextrd(A::Mem{arg[immA], 2*16 +  8}, (A::Xmm)r(z), 2);
                    a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2);

                    a->vpextrd(A::Mem{arg[immA], 3*16 +  0}, (A::Xmm)r(x), 3);
                    a->vpextrd(A::Mem{arg[immA], 3*16 +  4}, (A::Xmm)r(y), 3);
                    a->vpextrd(A::Mem{arg[immA], 3*16 +  8}, (A::Xmm)r(z), 3);
                    a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3);
                    // Now we need to store the upper 128 bits of x,y,z,w.
                    // Storing in this order rather than interlacing minimizes temporaries.
                    a->vextracti128(dst(), r(x), 1);
                    a->vmovd  (A::Mem{arg[immA], 4*16 +  0}, (A::Xmm)dst()   );
                    a->vpextrd(A::Mem{arg[immA], 5*16 +  0}, (A::Xmm)dst(), 1);
                    a->vpextrd(A::Mem{arg[immA], 6*16 +  0}, (A::Xmm)dst(), 2);
                    a->vpextrd(A::Mem{arg[immA], 7*16 +  0}, (A::Xmm)dst(), 3);

                    a->vextracti128(dst(), r(y), 1);
                    a->vmovd  (A::Mem{arg[immA], 4*16 +  4}, (A::Xmm)dst()   );
                    a->vpextrd(A::Mem{arg[immA], 5*16 +  4}, (A::Xmm)dst(), 1);
                    a->vpextrd(A::Mem{arg[immA], 6*16 +  4}, (A::Xmm)dst(), 2);
                    a->vpextrd(A::Mem{arg[immA], 7*16 +  4}, (A::Xmm)dst(), 3);

                    a->vextracti128(dst(), r(z), 1);
                    a->vmovd  (A::Mem{arg[immA], 4*16 +  8}, (A::Xmm)dst()   );
                    a->vpextrd(A::Mem{arg[immA], 5*16 +  8}, (A::Xmm)dst(), 1);
                    a->vpextrd(A::Mem{arg[immA], 6*16 +  8}, (A::Xmm)dst(), 2);
                    a->vpextrd(A::Mem{arg[immA], 7*16 +  8}, (A::Xmm)dst(), 3);

                    a->vextracti128(dst(), r(w), 1);
                    a->vmovd  (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst()   );
                    a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1);
                    a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2);
                    a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3);
                } break;
3325 
                // Scalar loads zero the register and insert one low lane; vector loads
                // zero-extend eight u8/u16 values up to u32 lanes.
                case Op::load8:  if (scalar) {
                                     a->vpxor  (dst(), dst(), dst());
                                     a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
                                 } else {
                                     a->vpmovzxbd(dst(), A::Mem{arg[immA]});
                                 } break;

                case Op::load16: if (scalar) {
                                     a->vpxor  (dst(), dst(), dst());
                                     a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
                                 } else {
                                     a->vpmovzxwd(dst(), A::Mem{arg[immA]});
                                 } break;

                case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), A::Mem{arg[immA]}); }
                                 else        { a->vmovups(        dst(), A::Mem{arg[immA]}); }
                                 break;

                // load64: gather component immB out of interleaved xy pairs using a
                // shuffle-index constant (load64_index) and vpermps.
                case Op::load64: if (scalar) {
                                    a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
                                 } else {
                                    A::Ymm tmp = alloc_tmp();
                                    a->vmovups(tmp, &load64_index);
                                    a->vpermps(dst(), tmp, A::Mem{arg[immA],  0});
                                    a->vpermps(  tmp, tmp, A::Mem{arg[immA], 32});
                                    // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1.
                                    a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20);
                                    free_tmp(tmp);
                                 } break;

                // load128: pick component immB from each 16-byte xyzw group, one
                // 32-bit element at a time, then merge the two xmm halves.
                case Op::load128: if (scalar) {
                                      a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
                                  } else {
                                      // Load 4 low values into xmm tmp,
                                      A::Ymm tmp = alloc_tmp();
                                      A::Xmm t = (A::Xmm)tmp;
                                      a->vmovd  (t,   A::Mem{arg[immA], 0*16 + 4*immB}   );
                                      a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1);
                                      a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2);
                                      a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3);

                                      // Load 4 high values into xmm dst(),
                                      A::Xmm d = (A::Xmm)dst();
                                      a->vmovd  (d,   A::Mem{arg[immA], 4*16 + 4*immB}   );
                                      a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1);
                                      a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2);
                                      a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3);

                                      // Merge the two, ymm dst() = {xmm tmp|xmm dst()}
                                      a->vperm2f128(dst(), tmp,dst(), 0x20);
                                      free_tmp(tmp);
                                  } break;

                case Op::gather8: {
                    // As usual, the gather base pointer is immB bytes off of uniform immA.
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    A::Ymm tmp = alloc_tmp();
                    a->vmovups(tmp, any(x));

                    for (int i = 0; i < active_lanes; i++) {
                        if (i == 4) {
                            // vpextrd can only pluck indices out from an Xmm register,
                            // so we manually swap over to the top when we're halfway through.
                            a->vextracti128((A::Xmm)tmp, tmp, 1);
                        }
                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                        a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i);
                    }
                    a->vpmovzxbd(dst(), dst());
                    free_tmp(tmp);
                } break;

                case Op::gather16: {
                    // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    A::Ymm tmp = alloc_tmp();
                    a->vmovups(tmp, any(x));

                    for (int i = 0; i < active_lanes; i++) {
                        if (i == 4) {
                            a->vextracti128((A::Xmm)tmp, tmp, 1);
                        }
                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                        a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i);
                    }
                    a->vpmovzxwd(dst(), dst());
                    free_tmp(tmp);
                } break;

                case Op::gather32:
                if (scalar) {
                    // Our gather base pointer is immB bytes off of uniform immA.
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    // Grab our index from lane 0 of the index argument.
                    a->vmovd(GP1, (A::Xmm)r(x));

                    // dst = *(base + 4*index)
                    a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR});
                } else {
                    a->mov(GP0, A::Mem{arg[immA], immB});

                    A::Ymm mask = alloc_tmp();
                    a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)

                    // NOTE: vgatherdps consumes (zeroes) the mask register as it runs,
                    // which is why mask is a throwaway temporary.
                    a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask);
                    free_tmp(mask);
                }
                break;

                case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB});
                                    break;

                // array32: arg[immA]+immB holds a pointer; dereference it, offset by
                // immC, and broadcast the 32-bit value found there to all lanes.
                case Op::array32: a->mov(GP0, A::Mem{arg[immA], immB});
                                  a->vbroadcastss(dst(), A::Mem{GP0, immC});
                                  break;

                // index: lane i gets N - i (broadcast N, subtract the iota constant).
                case Op::index: a->vmovd((A::Xmm)dst(), N);
                                a->vbroadcastss(dst(), dst());
                                a->vpsubd(dst(), dst(), &iota);
                                break;
3449 
                // We can swap the arguments of symmetric instructions to make better use of any().
                case Op::add_f32:
                    if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); }
                    else           { a->vaddps(dst(y), r(y), any(x)); }
                                     break;

                case Op::mul_f32:
                    if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); }
                    else           { a->vmulps(dst(y), r(y), any(x)); }
                                     break;

                case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break;
                case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break;
                case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break;  // Order matters,
                case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break;  // see test SkVM_min_max.

                // All three FMA encodings compute x*y+z here; they differ only in
                // which source operand is overwritten, so we pick the form whose
                // clobbered input dies at this instruction.  If none dies, copy x
                // into a fresh dst() and accumulate there.
                case Op::fma_f32:
                    if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups    (dst(), any(x));
                                        a->vfmadd132ps(dst(), r(z), any(y)); }
                                        break;

                case Op::fms_f32:
                    if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups    (dst(), any(x));
                                        a->vfmsub132ps(dst(), r(z), any(y)); }
                                        break;

                case Op::fnma_f32:
                    if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups     (dst(), any(x));
                                        a->vfnmadd132ps(dst(), r(z), any(y)); }
                                        break;

                // In situations like this we want to try aliasing dst(x) when x is
                // already in a register, but not if we'd have to load it from the stack
                // just to alias it.  That's done better directly into the new register.
                case Op::sqrt_f32:
                    if (in_reg(x)) { a->vsqrtps(dst(x),  r(x)); }
                    else           { a->vsqrtps(dst(), any(x)); }
                                     break;

                case Op::add_i32:
                    if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); }
                    else           { a->vpaddd(dst(y), r(y), any(x)); }
                                     break;

                case Op::mul_i32:
                    if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); }
                    else           { a->vpmulld(dst(y), r(y), any(x)); }
                                     break;

                case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break;

                case Op::bit_and:
                    if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); }
                    else           { a->vpand(dst(y), r(y), any(x)); }
                                     break;
                case Op::bit_or:
                    if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); }
                    else           { a->vpor(dst(y), r(y), any(x)); }
                                     break;
                case Op::bit_xor:
                    if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); }
                    else           { a->vpxor(dst(y), r(y), any(x)); }
                                     break;

                case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x.

                // select: dst = x ? y : z, with x as the per-byte blend mask.
                case Op::select:
                    if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); }
                    else              { a->vpblendvb(dst(x), r(z), any(y), r(x)); }
                                        break;

                case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break;
                case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break;
                case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break;

                case Op::eq_i32:
                    if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); }
                    else           { a->vpcmpeqd(dst(y), r(y), any(x)); }
                                     break;

                case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break;

                case Op::eq_f32:
                    if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); }
                    else           { a->vcmpeqps(dst(y), r(y), any(x)); }
                                     break;
                case Op::neq_f32:
                    if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); }
                    else           { a->vcmpneqps(dst(y), r(y), any(x)); }
                                     break;

                // Operands swapped: x > y == y < x, and x >= y == y <= x.
                case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break;
                case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break;

                case Op::ceil:
                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::CEIL); }
                    else           { a->vroundps(dst(), any(x), Assembler::CEIL); }
                                     break;

                case Op::floor:
                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::FLOOR); }
                    else           { a->vroundps(dst(), any(x), Assembler::FLOOR); }
                                     break;

                case Op::to_f32:
                    if (in_reg(x)) { a->vcvtdq2ps(dst(x),  r(x)); }
                    else           { a->vcvtdq2ps(dst(), any(x)); }
                                     break;

                case Op::trunc:
                    if (in_reg(x)) { a->vcvttps2dq(dst(x),  r(x)); }
                    else           { a->vcvttps2dq(dst(), any(x)); }
                                     break;

                case Op::round:
                    if (in_reg(x)) { a->vcvtps2dq(dst(x),  r(x)); }
                    else           { a->vcvtps2dq(dst(), any(x)); }
                                     break;

                case Op::to_fp16:
                    a->vcvtps2ph(dst(x), r(x), A::CURRENT);  // f32 ymm -> f16 xmm
                    a->vpmovzxwd(dst(), dst());              // f16 xmm -> f16 ymm
                    break;

                case Op::from_fp16:
                    a->vpackusdw(dst(x), r(x), r(x));  // f16 ymm -> f16 xmm
                    a->vpermq   (dst(), dst(), 0xd8);  // swap middle two 64-bit lanes
                    a->vcvtph2ps(dst(), dst());        // f16 xmm -> f32 ymm
                    break;

                case Op::duplicate: break;
3590 
            #elif defined(__aarch64__)
                case Op::assert_true: {
                    a->uminv4s(dst(), r(x));   // uminv acts like an all() across the vector.
                    a->movs(GP0, dst(), 0);
                    A::Label all_true;
                    a->cbnz(GP0, &all_true);
                    a->brk(0);                 // Trap if any lane was false.
                    a->label(&all_true);
                } break;

                case Op::trace_line:
                case Op::trace_var:
                case Op::trace_enter:
                case Op::trace_exit:
                case Op::trace_scope:
                    /* Force this program to run in the interpreter. */
                    return false;

                // index: lane i gets N - i (dup-broadcast N, subtract the iota constant).
                case Op::index: {
                    A::V tmp = alloc_tmp();
                    a->ldrq (tmp, &iota);
                    a->dup4s(dst(), N);
                    a->sub4s(dst(), dst(), tmp);
                    free_tmp(tmp);
                } break;

                // store8/store16: xtn* narrow the lanes (names suggest s->h, h->b)
                // before storing the low bytes.
                case Op::store8: a->xtns2h(dst(x), r(x));
                                 a->xtnh2b(dst(), dst());
                   if (scalar) { a->strb  (dst(), arg[immA]); }
                   else        { a->strs  (dst(), arg[immA]); }
                                 break;

                case Op::store16: a->xtns2h(dst(x), r(x));
                    if (scalar) { a->strh  (dst(), arg[immA]); }
                    else        { a->strd  (dst(), arg[immA]); }
                                  break;

                case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); }
                                  else        { a->strq(r(x), arg[immA]); }
                                                break;

                // st24s needs its two source registers to be consecutive; if x,y
                // aren't, copy them into a consecutive tmp pair first (orr16b of a
                // register with itself is a plain register move).
                case Op::store64: if (scalar) {
                                      a->strs(r(x), arg[immA], 0);
                                      a->strs(r(y), arg[immA], 1);
                                  } else if (r(y) == r(x)+1) {
                                      a->st24s(r(x), arg[immA]);
                                  } else {
                                      Reg tmp0 = alloc_tmp(2),
                                          tmp1 = (Reg)(tmp0+1);
                                      a->orr16b(tmp0, r(x), r(x));
                                      a->orr16b(tmp1, r(y), r(y));
                                      a-> st24s(tmp0, arg[immA]);
                                      free_tmp(tmp0);
                                      free_tmp(tmp1);
                                  } break;

                // Same idea as store64, but st44s wants four consecutive registers.
                case Op::store128:
                    if (scalar) {
                        a->strs(r(x), arg[immA], 0);
                        a->strs(r(y), arg[immA], 1);
                        a->strs(r(z), arg[immA], 2);
                        a->strs(r(w), arg[immA], 3);
                    } else if (r(y) == r(x)+1 &&
                               r(z) == r(x)+2 &&
                               r(w) == r(x)+3) {
                        a->st44s(r(x), arg[immA]);
                    } else {
                        Reg tmp0 = alloc_tmp(4),
                            tmp1 = (Reg)(tmp0+1),
                            tmp2 = (Reg)(tmp0+2),
                            tmp3 = (Reg)(tmp0+3);
                        a->orr16b(tmp0, r(x), r(x));
                        a->orr16b(tmp1, r(y), r(y));
                        a->orr16b(tmp2, r(z), r(z));
                        a->orr16b(tmp3, r(w), r(w));
                        a-> st44s(tmp0, arg[immA]);
                        free_tmp(tmp0);
                        free_tmp(tmp1);
                        free_tmp(tmp2);
                        free_tmp(tmp3);
                    } break;
3672 
3673 
                // Byte/halfword loads are widened unsigned to 32-bit lanes after loading.
                case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); }
                                else        { a->ldrs(dst(), arg[immA]); }
                                              a->uxtlb2h(dst(), dst());
                                              a->uxtlh2s(dst(), dst());
                                              break;

                case Op::load16: if (scalar) { a->ldrh(dst(), arg[immA]); }
                                 else        { a->ldrd(dst(), arg[immA]); }
                                               a->uxtlh2s(dst(), dst());
                                               break;

                case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); }
                                 else        { a->ldrq(dst(), arg[immA]); }
                                               break;

                // load64: ld24s deinterleaves xy pairs into two consecutive tmp
                // registers; keep the one selected by immB as the destination.
                case Op::load64: if (scalar) {
                                    a->ldrs(dst(), arg[immA], immB);
                                 } else {
                                    Reg tmp0 = alloc_tmp(2),
                                        tmp1 = (Reg)(tmp0+1);
                                    a->ld24s(tmp0, arg[immA]);
                                    // TODO: return both
                                    switch (immB) {
                                        case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break;
                                        case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break;
                                    }
                                 } break;

                // load128: ld44s deinterleaves xyzw groups into four consecutive tmp
                // registers; keep the component immB, free the other three.
                case Op::load128: if (scalar) {
                                      a->ldrs(dst(), arg[immA], immB);
                                  } else {
                                      Reg tmp0 = alloc_tmp(4),
                                          tmp1 = (Reg)(tmp0+1),
                                          tmp2 = (Reg)(tmp0+2),
                                          tmp3 = (Reg)(tmp0+3);
                                      a->ld44s(tmp0, arg[immA]);
                                      // TODO: return all four
                                      switch (immB) {
                                          case 0: mark_tmp_as_dst(tmp0); break;
                                          case 1: mark_tmp_as_dst(tmp1); break;
                                          case 2: mark_tmp_as_dst(tmp2); break;
                                          case 3: mark_tmp_as_dst(tmp3); break;
                                      }
                                      if (immB != 0) { free_tmp(tmp0); }
                                      if (immB != 1) { free_tmp(tmp1); }
                                      if (immB != 2) { free_tmp(tmp2); }
                                      if (immB != 3) { free_tmp(tmp3); }
                                  } break;

                case Op::uniform32: a->add(GP0, arg[immA], immB);
                                    a->ld1r4s(dst(), GP0);
                                    break;

                // array32: arg[immA]+immB holds a pointer; dereference it, advance by
                // immC, and broadcast the 32-bit value there to all lanes.
                case Op::array32: a->add(GP0, arg[immA], immB);
                                  a->ldrd(GP0, GP0);
                                  a->add(GP0, GP0, immC);
                                  a->ld1r4s(dst(), GP0);
                                  break;

                case Op::gather8: {
                    // As usual, the gather base pointer is immB bytes off of uniform immA.
                    a->add (GP0, arg[immA], immB);  // GP0 = &(gather base pointer)
                    a->ldrd(GP0, GP0);              // GP0 =   gather base pointer

                    for (int i = 0; i < active_lanes; i++) {
                        a->movs(GP1, r(x), i);    // Extract index lane i into GP1.
                        a->add (GP1, GP0, GP1);   // Add the gather base pointer.
                        a->ldrb(GP1, GP1);        // Load that byte.
                        a->inss(dst(x), GP1, i);  // Insert it into dst() lane i.
                    }
                } break;

                // See gather8 for general idea; comments here only where gather16 differs.
                case Op::gather16: {
                    a->add (GP0, arg[immA], immB);
                    a->ldrd(GP0, GP0);
                    for (int i = 0; i < active_lanes; i++) {
                        a->movs(GP1, r(x), i);
                        a->add (GP1, GP0, GP1, A::LSL, 1);  // Scale index 2x into a byte offset.
                        a->ldrh(GP1, GP1);                  // 2-byte load.
                        a->inss(dst(x), GP1, i);
                    }
                } break;

                // See gather8 for general idea; comments here only where gather32 differs.
                case Op::gather32: {
                    a->add (GP0, arg[immA], immB);
                    a->ldrd(GP0, GP0);
                    for (int i = 0; i < active_lanes; i++) {
                        a->movs(GP1, r(x), i);
                        a->add (GP1, GP0, GP1, A::LSL, 2);  // Scale index 4x into a byte offset.
                        a->ldrs(GP1, GP1);                  // 4-byte load.
                        a->inss(dst(x), GP1, i);
                    }
                } break;
3769 
                case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break;
                case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break;
                case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break;
                case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break;

                case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break;

                // fmla/fmls accumulate into their destination: if z dies here we can
                // accumulate in place, otherwise copy z into dst() first (orr16b of a
                // register with itself is a move).
                case Op::fma_f32: // fmla.4s is z += x*y
                    if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmla4s(dst(), r(x), r(y)); }
                                        break;

                case Op::fnma_f32:  // fmls.4s is z -= x*y
                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmls4s(dst(), r(x), r(y)); }
                                        break;

                case Op::fms_f32:   // calculate z - xy, then negate to xy - z
                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmls4s(dst(), r(x), r(y)); }
                                        // (fneg4s executes on both paths above.)
                                        a->fneg4s(dst(), dst());
                                        break;

                case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break;
                case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break;
                case Op:: eq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); break;
                // No direct not-equal compare; compute eq then invert all bits.
                case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y));
                                  a->not16b  (dst(), dst());         break;


                case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break;
                case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break;
3805                 case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break;
3806 
3807                 case Op::bit_and  : a->and16b(dst(x,y), r(x), r(y)); break;
3808                 case Op::bit_or   : a->orr16b(dst(x,y), r(x), r(y)); break;
3809                 case Op::bit_xor  : a->eor16b(dst(x,y), r(x), r(y)); break;
3810                 case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break;
3811 
3812                 case Op::select: // bsl16b is x = x ? y : z
3813                     if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
3814                     else              { a->orr16b(dst(), r(x), r(x));
3815                                         a->bsl16b(dst(), r(y), r(z)); }
3816                                         break;
3817 
3818                 // fmin4s and fmax4s don't work the way we want with NaN,
3819                 // so we write them the long way:
3820                 case Op::min_f32: // min(x,y) = y<x ? y : x
3821                                   a->fcmgt4s(dst(), r(x), r(y));
3822                                   a->bsl16b (dst(), r(y), r(x));
3823                                   break;
3824 
3825                 case Op::max_f32: // max(x,y) = x<y ? y : x
3826                                   a->fcmgt4s(dst(), r(y), r(x));
3827                                   a->bsl16b (dst(), r(y), r(x));
3828                                   break;
3829 
3830                 case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break;
3831                 case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break;
3832                 case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break;
3833 
3834                 case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break;
3835                 case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break;
3836 
3837                 case Op::to_f32: a->scvtf4s (dst(x), r(x)); break;
3838                 case Op::trunc:  a->fcvtzs4s(dst(x), r(x)); break;
3839                 case Op::round:  a->fcvtns4s(dst(x), r(x)); break;
3840                 case Op::ceil:   a->frintp4s(dst(x), r(x)); break;
3841                 case Op::floor:  a->frintm4s(dst(x), r(x)); break;
3842 
3843                 case Op::to_fp16:
3844                     a->fcvtn  (dst(x), r(x));    // 4x f32 -> 4x f16 in bottom four lanes
3845                     a->uxtlh2s(dst(), dst());    // expand to 4x f16 in even 16-bit lanes
3846                     break;
3847 
3848                 case Op::from_fp16:
3849                     a->xtns2h(dst(x), r(x));     // pack even 16-bit lanes into bottom four lanes
3850                     a->fcvtl (dst(), dst());     // 4x f16 -> 4x f32
3851                     break;
3852 
3853                 case Op::duplicate: break;
3854             #endif
3855             }
3856 
3857             // Proactively free the registers holding any value that dies here.
3858             if (rd != NA &&                   dies_here(regs[rd])) { regs[rd] = NA; }
3859             if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; }
3860             if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; }
3861             if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; }
3862             if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; }
3863             return true;
3864         };
3865 
3866         #if defined(__x86_64__) || defined(_M_X64)
3867             auto jump_if_less = [&](A::Label* l) { a->jl (l); };
3868             auto jump         = [&](A::Label* l) { a->jmp(l); };
3869 
3870             auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
3871             auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
3872         #elif defined(__aarch64__)
3873             auto jump_if_less = [&](A::Label* l) { a->blt(l); };
3874             auto jump         = [&](A::Label* l) { a->b  (l); };
3875 
3876             auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
3877             auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
3878         #endif
3879 
3880         A::Label body,
3881                  tail,
3882                  done;
3883 
3884         enter();
3885         for (Val id = 0; id < (Val)instructions.size(); id++) {
3886             if (fImpl->visualizer && is_trace(instructions[id].op)) {
3887                 // Make sure trace commands stay on JIT for visualizer
3888                 continue;
3889             }
3890             if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
3891                 return false;
3892             }
3893         }
3894 
3895         // This point marks a kind of canonical fixed point for register contents: if loop
3896         // code is generated as if these registers are holding these values, the next time
3897         // the loop comes around we'd better find those same registers holding those same values.
3898         auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot,
3899                                       saved_next_stack_slot=next_stack_slot]{
3900             for (int r = 0; r < (int)regs.size(); r++) {
3901                 if (regs[r] != incoming[r]) {
3902                     regs[r]  = incoming[r];
3903                     if (regs[r] >= 0) {
3904                         load_from_memory((Reg)r, regs[r]);
3905                     }
3906                 }
3907             }
3908             *stack_hint = std::max(*stack_hint, next_stack_slot);
3909             stack_slot = saved_stack_slot;
3910             next_stack_slot = saved_next_stack_slot;
3911         };
3912 
3913         a->label(&body);
3914         {
3915             a->cmp(N, K);
3916             jump_if_less(&tail);
3917             for (Val id = 0; id < (Val)instructions.size(); id++) {
3918                 if (fImpl->visualizer != nullptr && is_trace(instructions[id].op)) {
3919                     // Make sure trace commands stay on JIT for visualizer
3920                     continue;
3921                 }
3922                 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
3923                     return false;
3924                 }
3925             }
3926             restore_incoming_regs();
3927             for (int i = 0; i < (int)fImpl->strides.size(); i++) {
3928                 if (fImpl->strides[i]) {
3929                     add(arg[i], K*fImpl->strides[i]);
3930                 }
3931             }
3932             sub(N, K);
3933             jump(&body);
3934         }
3935 
3936         a->label(&tail);
3937         {
3938             a->cmp(N, 1);
3939             jump_if_less(&done);
3940             for (Val id = 0; id < (Val)instructions.size(); id++) {
3941                 if (fImpl->visualizer && is_trace(instructions[id].op)) {
3942                     // Make sure trace commands stay on JIT for visualizer
3943                     continue;
3944                 }
3945                 if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) {
3946                     return false;
3947                 }
3948             }
3949             restore_incoming_regs();
3950             for (int i = 0; i < (int)fImpl->strides.size(); i++) {
3951                 if (fImpl->strides[i]) {
3952                     add(arg[i], 1*fImpl->strides[i]);
3953                 }
3954             }
3955             sub(N, 1);
3956             jump(&tail);
3957         }
3958 
3959         a->label(&done);
3960         {
3961             exit();
3962         }
3963 
3964         // On ARM64, we use immediate offsets to adjust the stack pointer, and those are limited to
3965         // 12 bits. If our function is going to require more than 4k of stack, just fail. We could
3966         // tweak the code that adjusts `sp`, but then we risk exceeding the (larger) immediate limit
3967         // on our sp-relative load and store opcodes.
3968     #if defined(__aarch64__)
3969         const int stack_bytes = (*stack_hint) * K * 4;
3970         if (stack_bytes > mask(12)) {
3971             return false;
3972         }
3973     #endif
3974 
3975         // Except for explicit aligned load and store instructions, AVX allows
3976         // memory operands to be unaligned.  So even though we're creating 16
3977         // byte patterns on ARM or 32-byte patterns on x86, we only need to
3978         // align to 4 bytes, the element size and alignment requirement.
3979 
3980         constants.foreach([&](int imm, A::Label* label) {
3981             a->align(4);
3982             a->label(label);
3983             for (int i = 0; i < K; i++) {
3984                 a->word(imm);
3985             }
3986         });
3987 
3988         if (!iota.references.empty()) {
3989             a->align(4);
3990             a->label(&iota);        // 0,1,2,3,4,...
3991             for (int i = 0; i < K; i++) {
3992                 a->word(i);
3993             }
3994         }
3995 
3996         if (!load64_index.references.empty()) {
3997             a->align(4);
3998             a->label(&load64_index);  // {0,2,4,6|1,3,5,7}
3999             a->word(0); a->word(2); a->word(4); a->word(6);
4000             a->word(1); a->word(3); a->word(5); a->word(7);
4001         }
4002 
4003         return true;
4004     }
4005 
setupJIT(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)4006     void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
4007                            const char* debug_name) {
4008         // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
4009         // and stack_hint/registers_used to feed forward into the next jit() call.
4010         Assembler a{nullptr};
4011         int stack_hint = -1;
4012         uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
4013         if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
4014             return;
4015         }
4016 
4017         fImpl->jit_size = a.size();
4018         void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
4019         fImpl->jit_entry.store(jit_entry);
4020 
4021         // Assemble the program for real with stack_hint/registers_used as feedback from first call.
4022         a = Assembler{jit_entry};
4023         SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
4024         SkASSERT(a.size() <= fImpl->jit_size);
4025 
4026         // Remap as executable, and flush caches on platforms that need that.
4027         remap_as_executable(jit_entry, fImpl->jit_size);
4028 
4029     #if !defined(SK_BUILD_FOR_WIN)
4030         // For profiling and debugging, it's helpful to have this code loaded
4031         // dynamically rather than just jumping info fImpl->jit_entry.
4032         if (gSkVMJITViaDylib) {
4033             // Dump the raw program binary.
4034             SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
4035             int fd = mkstemp(path.data());
4036             ::write(fd, jit_entry, a.size());
4037             close(fd);
4038 
4039             this->dropJIT();  // (unmap and null out fImpl->jit_entry.)
4040 
4041             // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4042             SkString cmd = SkStringPrintf(
4043                     "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4044                     " | clang -x assembler -shared - -o %s",
4045                     path.c_str(), path.c_str());
4046         #if defined(__aarch64__)
4047             cmd.append(" -arch arm64");
4048         #endif
4049             system(cmd.c_str());
4050 
4051             // Load that dynamic library and look up skvm_jit().
4052             fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
4053             void* sym = nullptr;
4054             for (const char* name : {"skvm_jit", "_skvm_jit"} ) {
4055                 if (!sym) { sym = dlsym(fImpl->dylib, name); }
4056             }
4057             fImpl->jit_entry.store(sym);
4058         }
4059     #endif
4060     }
4061 
disassemble(SkWStream * o) const4062     void Program::disassemble(SkWStream* o) const {
4063     #if !defined(SK_BUILD_FOR_WIN)
4064         SkDebugfStream debug;
4065         if (!o) { o = &debug; }
4066 
4067         const void* jit_entry = fImpl->jit_entry.load();
4068         size_t jit_size = fImpl->jit_size;
4069 
4070         if (!jit_entry) {
4071             o->writeText("Program not JIT'd. Did you pass --jit?\n");
4072             return;
4073         }
4074 
4075         char path[] = "/tmp/skvm-jit.XXXXXX";
4076         int fd = mkstemp(path);
4077         ::write(fd, jit_entry, jit_size);
4078         close(fd);
4079 
4080         // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4081         SkString cmd = SkStringPrintf(
4082                 "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4083                 " | clang -x assembler -shared - -o %s",
4084                 path, path);
4085         #if defined(__aarch64__)
4086             cmd.append(" -arch arm64");
4087         #endif
4088         system(cmd.c_str());
4089 
4090         // Now objdump to disassemble our function:
4091         // TODO: We could trim this down to just our code using '--disassemble=<symbol name>`,
4092         // but the symbol name varies with OS, and that option may be missing from objdump on some
4093         // machines? There also apears to be quite a bit of junk after the end of the JIT'd code.
4094         // Trimming that would let us pass '--visualize-jumps' and get the loop annotated.
4095         // With the junk, we tend to end up with a bunch of stray jumps that pollute the ASCII art.
4096         cmd = SkStringPrintf("objdump -D %s", path);
4097     #if defined(SK_BUILD_FOR_UNIX)
4098         cmd.append(" --section=.text");
4099     #endif
4100         FILE* fp = popen(cmd.c_str(), "r");
4101         if (!fp) {
4102             o->writeText("objdump failed\n");
4103             return;
4104         }
4105 
4106         char line[1024];
4107         while (fgets(line, sizeof(line), fp)) {
4108             o->writeText(line);
4109         }
4110 
4111         pclose(fp);
4112     #endif
4113     }
4114 
4115 #endif
4116 
4117 }  // namespace skvm
4118