• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #ifndef SkVM_DEFINED
9 #define SkVM_DEFINED
10 
11 #include "include/core/SkTypes.h"
12 #include "include/private/SkMacros.h"
13 #include "include/private/SkTHash.h"
14 #include "src/core/SkVM_fwd.h"
15 #include <vector>      // std::vector
16 
17 class SkWStream;
18 
19 namespace skvm {
20 
21     class Assembler {
22     public:
23         explicit Assembler(void* buf);
24 
25         size_t size() const;
26 
27         // Order matters... GP64, Xmm, Ymm values match 4-bit register encoding for each.
28         enum GP64 {
29             rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi,
30             r8 , r9 , r10, r11, r12, r13, r14, r15,
31         };
32         enum Xmm {
33             xmm0, xmm1, xmm2 , xmm3 , xmm4 , xmm5 , xmm6 , xmm7 ,
34             xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
35         };
36         enum Ymm {
37             ymm0, ymm1, ymm2 , ymm3 , ymm4 , ymm5 , ymm6 , ymm7 ,
38             ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15,
39         };
40 
41         // X and V values match 5-bit encoding for each (nothing tricky).
42         enum X {
43             x0 , x1 , x2 , x3 , x4 , x5 , x6 , x7 ,
44             x8 , x9 , x10, x11, x12, x13, x14, x15,
45             x16, x17, x18, x19, x20, x21, x22, x23,
46             x24, x25, x26, x27, x28, x29, x30, xzr,
47         };
48         enum V {
49             v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 ,
50             v8 , v9 , v10, v11, v12, v13, v14, v15,
51             v16, v17, v18, v19, v20, v21, v22, v23,
52             v24, v25, v26, v27, v28, v29, v30, v31,
53         };
54 
55         void bytes(const void*, int);
56         void byte(uint8_t);
57         void word(uint32_t);
58 
59         // x86-64
60 
61         void align(int mod);
62 
63         void int3();
64         void vzeroupper();
65         void ret();
66 
67         void add(GP64, int imm);
68         void sub(GP64, int imm);
69 
70         void movq(GP64 dst, GP64 src, int off);  // dst = *(src+off)
71 
72         struct Label {
73             int                                      offset = 0;
74             enum { NotYetSet, ARMDisp19, X86Disp32 } kind = NotYetSet;
75             std::vector<int>                         references;
76         };
77 
78         struct YmmOrLabel {
79             Ymm    ymm   = ymm0;
80             Label* label = nullptr;
81 
YmmOrLabelYmmOrLabel82             /*implicit*/ YmmOrLabel(Ymm    y) : ymm  (y) { SkASSERT(!label); }
YmmOrLabelYmmOrLabel83             /*implicit*/ YmmOrLabel(Label* l) : label(l) { SkASSERT( label); }
84         };
85 
86         // All dst = x op y.
87         using DstEqXOpY = void(Ymm dst, Ymm x, Ymm y);
88         DstEqXOpY vpandn,
89                   vpmulld,
90                   vpsubw, vpmullw,
91                   vdivps,
92                   vfmadd132ps, vfmadd213ps, vfmadd231ps,
93                   vpackusdw, vpackuswb,
94                   vpcmpeqd, vpcmpgtd;
95 
96         using DstEqXOpYOrLabel = void(Ymm dst, Ymm x, YmmOrLabel y);
97         DstEqXOpYOrLabel vpand, vpor, vpxor,
98                          vpaddd, vpsubd,
99                          vaddps, vsubps, vmulps, vminps, vmaxps;
100 
101         // Floating point comparisons are all the same instruction with varying imm.
102         void vcmpps(Ymm dst, Ymm x, Ymm y, int imm);
vcmpeqps(Ymm dst,Ymm x,Ymm y)103         void vcmpeqps (Ymm dst, Ymm x, Ymm y) { this->vcmpps(dst,x,y,0); }
vcmpltps(Ymm dst,Ymm x,Ymm y)104         void vcmpltps (Ymm dst, Ymm x, Ymm y) { this->vcmpps(dst,x,y,1); }
vcmpleps(Ymm dst,Ymm x,Ymm y)105         void vcmpleps (Ymm dst, Ymm x, Ymm y) { this->vcmpps(dst,x,y,2); }
vcmpneqps(Ymm dst,Ymm x,Ymm y)106         void vcmpneqps(Ymm dst, Ymm x, Ymm y) { this->vcmpps(dst,x,y,4); }
107 
108         using DstEqXOpImm = void(Ymm dst, Ymm x, int imm);
109         DstEqXOpImm vpslld, vpsrld, vpsrad,
110                     vpsrlw,
111                     vpermq,
112                     vroundps;
113 
114         enum { NEAREST, FLOOR, CEIL, TRUNC };  // vroundps immediates
115 
116         using DstEqOpX = void(Ymm dst, Ymm x);
117         DstEqOpX vmovdqa, vcvtdq2ps, vcvttps2dq, vcvtps2dq, vsqrtps;
118 
119         void vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z);
120 
121         Label here();
122         void label(Label*);
123 
124         void jmp(Label*);
125         void je (Label*);
126         void jne(Label*);
127         void jl (Label*);
128         void jc (Label*);
129         void cmp(GP64, int imm);
130 
131         void vpshufb(Ymm dst, Ymm x, Label*);
132         void vptest(Ymm dst, Label*);
133 
134         void vbroadcastss(Ymm dst, Label*);
135         void vbroadcastss(Ymm dst, Xmm src);
136         void vbroadcastss(Ymm dst, GP64 ptr, int off);  // dst = *(ptr+off)
137 
138         void vmovups  (Ymm dst, GP64 ptr);   // dst = *ptr, 256-bit
139         void vpmovzxwd(Ymm dst, GP64 ptr);   // dst = *ptr, 128-bit, each uint16_t expanded to int
140         void vpmovzxbd(Ymm dst, GP64 ptr);   // dst = *ptr,  64-bit, each uint8_t  expanded to int
141         void vmovd    (Xmm dst, GP64 ptr);   // dst = *ptr,  32-bit
142 
143         enum Scale { ONE, TWO, FOUR, EIGHT };
144         void vmovd(Xmm dst, Scale, GP64 index, GP64 base);   // dst = *(base + scale*index),  32-bit
145 
146         void vmovups(GP64 ptr, Ymm src);     // *ptr = src, 256-bit
147         void vmovups(GP64 ptr, Xmm src);     // *ptr = src, 128-bit
148         void vmovq  (GP64 ptr, Xmm src);     // *ptr = src,  64-bit
149         void vmovd  (GP64 ptr, Xmm src);     // *ptr = src,  32-bit
150 
151         void movzbl(GP64 dst, GP64 ptr, int off);  // dst = *(ptr+off), uint8_t -> int
152         void movb  (GP64 ptr, GP64 src);           // *ptr = src, 8-bit
153 
154         void vmovd_direct(GP64 dst, Xmm src);  // dst = src, 32-bit
155         void vmovd_direct(Xmm dst, GP64 src);  // dst = src, 32-bit
156 
157         void vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm);  // dst = src; dst[imm] = *ptr, 16-bit
158         void vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm);  // dst = src; dst[imm] = *ptr,  8-bit
159 
160         void vpextrw(GP64 ptr, Xmm src, int imm);           // *dst = src[imm]           , 16-bit
161         void vpextrb(GP64 ptr, Xmm src, int imm);           // *dst = src[imm]           ,  8-bit
162 
163         // if (mask & 0x8000'0000) {
164         //     dst = base[scale*ix];
165         // }
166         // mask = 0;
167         void vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask);
168 
169         // aarch64
170 
171         // d = op(n,m)
172         using DOpNM = void(V d, V n, V m);
173         DOpNM  and16b, orr16b, eor16b, bic16b, bsl16b,
174                add4s,  sub4s,  mul4s,
175               cmeq4s, cmgt4s,
176                        sub8h,  mul8h,
177               fadd4s, fsub4s, fmul4s, fdiv4s, fmin4s, fmax4s,
178               fcmeq4s, fcmgt4s, fcmge4s,
179               tbl;
180 
181         // TODO: there are also float ==,<,<=,>,>= instructions with an immediate 0.0f,
182         // and the register comparison > and >= can also compare absolute values.  Interesting.
183 
184         // d += n*m
185         void fmla4s(V d, V n, V m);
186 
187         // d -= n*m
188         void fmls4s(V d, V n, V m);
189 
190         // d = op(n,imm)
191         using DOpNImm = void(V d, V n, int imm);
192         DOpNImm sli4s,
193                 shl4s, sshr4s, ushr4s,
194                                ushr8h;
195 
196         // d = op(n)
197         using DOpN = void(V d, V n);
198         DOpN not16b,    // d = ~n
199              scvtf4s,   // int -> float
200              fcvtzs4s,  // truncate float -> int
201              fcvtns4s,  // round float -> int
202              xtns2h,    // u32 -> u16
203              xtnh2b,    // u16 -> u8
204              uxtlb2h,   // u8 -> u16
205              uxtlh2s,   // u16 -> u32
206              uminv4s;   // dst[0] = min(n[0],n[1],n[2],n[3]), n as unsigned
207 
208         void brk (int imm16);
209         void ret (X);
210         void add (X d, X n, int imm12);
211         void sub (X d, X n, int imm12);
212         void subs(X d, X n, int imm12);  // subtract setting condition flags
213 
214         // There's another encoding for unconditional branches that can jump further,
215         // but this one encoded as b.al is simple to implement and should be fine.
b(Label * l)216         void b  (Label* l) { this->b(Condition::al, l); }
bne(Label * l)217         void bne(Label* l) { this->b(Condition::ne, l); }
blt(Label * l)218         void blt(Label* l) { this->b(Condition::lt, l); }
219 
220         // "cmp ..." is just an assembler mnemonic for "subs xzr, ..."!
cmp(X n,int imm12)221         void cmp(X n, int imm12) { this->subs(xzr, n, imm12); }
222 
223         // Compare and branch if zero/non-zero, as if
224         //      cmp(t,0)
225         //      beq/bne(l)
226         // but without setting condition flags.
227         void cbz (X t, Label* l);
228         void cbnz(X t, Label* l);
229 
230         void ldrq(V dst, Label*);  // 128-bit PC-relative load
231 
232         void ldrq(V dst, X src);  // 128-bit dst = *src
233         void ldrs(V dst, X src);  //  32-bit dst = *src
234         void ldrb(V dst, X src);  //   8-bit dst = *src
235 
236         void strq(V src, X dst);  // 128-bit *dst = src
237         void strs(V src, X dst);  //  32-bit *dst = src
238         void strb(V src, X dst);  //   8-bit *dst = src
239 
240         void fmovs(X dst, V src); // dst = 32-bit src[0]
241 
242     private:
243         // dst = op(dst, imm)
244         void op(int opcode, int opcode_ext, GP64 dst, int imm);
245 
246 
247         // dst = op(x,y) or op(x)
248         void op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W=false);
249         void op(int prefix, int map, int opcode, Ymm dst, Ymm x,        bool W=false) {
250             // Two arguments ops seem to pass them in dst and y, forcing x to 0 so VEX.vvvv == 1111.
251             this->op(prefix, map, opcode, dst,(Ymm)0,x, W);
252         }
253 
254         // dst = op(x,imm)
255         void op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm);
256 
257         // dst = op(x,label) or op(label)
258         void op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l);
259         void op(int prefix, int map, int opcode, Ymm dst, Ymm x, YmmOrLabel);
260 
261         // *ptr = ymm or ymm = *ptr, depending on opcode.
262         void load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr);
263 
264         // Opcode for 3-arguments ops is split between hi and lo:
265         //    [11 bits hi] [5 bits m] [6 bits lo] [5 bits n] [5 bits d]
266         void op(uint32_t hi, V m, uint32_t lo, V n, V d);
267 
268         // 2-argument ops, with or without an immediate.
269         void op(uint32_t op22, int imm, V n, V d);
op(uint32_t op22,V n,V d)270         void op(uint32_t op22, V n, V d) { this->op(op22,0,n,d); }
op(uint32_t op22,X x,V v)271         void op(uint32_t op22, X x, V v) { this->op(op22,0,(V)x,v); }
272 
273         // Order matters... value is 4-bit encoding for condition code.
274         enum class Condition { eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,al };
275         void b(Condition, Label*);
276 
277         void jump(uint8_t condition, Label*);
278 
279         int disp19(Label*);
280         int disp32(Label*);
281 
282         uint8_t* fCode;
283         uint8_t* fCurr;
284         size_t   fSize;
285     };
286 
// X-macro naming every skvm op exactly once: SKVM_OPS(M) expands to M(op) for each op,
// in the order listed.
// Order matters a little: Ops <=store32 are treated as having side effects.
#define SKVM_OPS(M)                       \
    M(assert_true)                        \
    M(store8)   M(store16)   M(store32)   \
    M(index)                              \
    M(load8)    M(load16)    M(load32)    \
    M(gather8)  M(gather16)  M(gather32)  \
    M(uniform8) M(uniform16) M(uniform32) \
    M(splat)                              \
    M(add_f32) M(add_i32) M(add_i16x2)    \
    M(sub_f32) M(sub_i32) M(sub_i16x2)    \
    M(mul_f32) M(mul_i32) M(mul_i16x2)    \
    M(div_f32)                            \
    M(min_f32)                            \
    M(max_f32)                            \
    M(mad_f32)                            \
    M(sqrt_f32)                           \
    M(shl_i32) M(shl_i16x2)               \
    M(shr_i32) M(shr_i16x2)               \
    M(sra_i32) M(sra_i16x2)               \
    M(add_f32_imm)                        \
    M(sub_f32_imm)                        \
    M(mul_f32_imm)                        \
    M(min_f32_imm)                        \
    M(max_f32_imm)                        \
    M(floor) M(trunc) M(round) M(to_f32)  \
    M( eq_f32) M( eq_i32) M( eq_i16x2)    \
    M(neq_f32) M(neq_i32) M(neq_i16x2)    \
    M( gt_f32) M( gt_i32) M( gt_i16x2)    \
    M(gte_f32) M(gte_i32) M(gte_i16x2)    \
    M(bit_and)                            \
    M(bit_or)                             \
    M(bit_xor)                            \
    M(bit_clear)                          \
    M(bit_and_imm)                        \
    M(bit_or_imm)                         \
    M(bit_xor_imm)                        \
    M(select) M(bytes) M(pack)
// End of SKVM_OPS

// One enumerator per SKVM_OPS entry, numbered from 0 in list order.
enum class Op : int {
#define M(name) name,
    SKVM_OPS(M)
#undef M
};
332 
// Integer ID type carried by I32 and F32 below.
using Val = int;
// The last Val ID (all bits set) is reserved as a sentinel meaning none, n/a, null, nil, etc.
static const Val NA = ~0;

// Thin single-field wrappers so the different kinds of handle are distinct types.
// NOTE(review): Arg::ix is presumably an index into the declared arguments — confirm.
struct Arg {
    int ix;
};
struct I32 {
    Val id;
};
struct F32 {
    Val id;
};

// Four F32 channels, in r,g,b,a order.
struct Color {
    F32 r;
    F32 g;
    F32 b;
    F32 a;
};
342 
343     struct OptimizedInstruction {
344         Op op;
345         Val x,y,z;
346         int immy,immz;
347 
348         int  death;
349         bool can_hoist;
350         bool used_in_loop;
351     };
352 
353     class Builder {
354     public:
355         SK_BEGIN_REQUIRE_DENSE
356         struct Instruction {
357             Op  op;         // v* = op(x,y,z,imm), where * == index of this Instruction.
358             Val x,y,z;      // Enough arguments for mad().
359             int immy,immz;  // Immediate bit pattern, shift count, argument index, etc.
360         };
361         SK_END_REQUIRE_DENSE
362 
363         Program done(const char* debug_name = nullptr) const;
364 
365         // Mostly for debugging, tests, etc.
program()366         std::vector<Instruction> program() const { return fProgram; }
367         std::vector<OptimizedInstruction> optimize(bool for_jit=false) const;
368 
369         // Declare an argument with given stride (use stride=0 for uniforms).
370         // TODO: different types for varying and uniforms?
371         Arg arg(int stride);
372 
373         // Convenience arg() wrappers for most common strides, sizeof(T) and 0.
374         template <typename T>
varying()375         Arg varying() { return this->arg(sizeof(T)); }
uniform()376         Arg uniform() { return this->arg(0); }
377 
378         // TODO: allow uniform (i.e. Arg) offsets to store* and load*?
379         // TODO: sign extension (signed types) for <32-bit loads?
380         // TODO: unsigned integer operations where relevant (just comparisons?)?
381 
382         // Assert cond is true, printing debug when not.
383         void assert_true(I32 cond, I32 debug);
assert_true(I32 cond,F32 debug)384         void assert_true(I32 cond, F32 debug) { this->assert_true(cond, this->bit_cast(debug)); }
assert_true(I32 cond)385         void assert_true(I32 cond)            { this->assert_true(cond, cond); }
386 
387         // Store {8,16,32}-bit varying.
388         void store8 (Arg ptr, I32 val);
389         void store16(Arg ptr, I32 val);
390         void store32(Arg ptr, I32 val);
391 
392         // Returns varying {n, n-1, n-2, ..., 1}, where n is the argument to Program::eval().
393         I32 index();
394 
395         // Load u8,u16,i32 varying.
396         I32 load8 (Arg ptr);
397         I32 load16(Arg ptr);
398         I32 load32(Arg ptr);
399 
400         // Load u8,u16,i32 uniform with byte-count offset.
401         I32 uniform8 (Arg ptr, int offset);
402         I32 uniform16(Arg ptr, int offset);
403         I32 uniform32(Arg ptr, int offset);
uniformF(Arg ptr,int offset)404         F32 uniformF (Arg ptr, int offset) { return this->bit_cast(this->uniform32(ptr,offset)); }
405 
406         // Gather u8,u16,i32 with varying element-count index from *(ptr + byte-count offset).
407         I32 gather8 (Arg ptr, int offset, I32 index);
408         I32 gather16(Arg ptr, int offset, I32 index);
409         I32 gather32(Arg ptr, int offset, I32 index);
410 
411         // Convenience methods for working with skvm::Uniforms.
412         struct Uniform {
413             Arg ptr;
414             int offset;
415         };
uniform8(Uniform u)416         I32 uniform8 (Uniform u)            { return this->uniform8 (u.ptr, u.offset); }
uniform16(Uniform u)417         I32 uniform16(Uniform u)            { return this->uniform16(u.ptr, u.offset); }
uniform32(Uniform u)418         I32 uniform32(Uniform u)            { return this->uniform32(u.ptr, u.offset); }
uniformF(Uniform u)419         F32 uniformF (Uniform u)            { return this->uniformF (u.ptr, u.offset); }
gather8(Uniform u,I32 index)420         I32 gather8  (Uniform u, I32 index) { return this->gather8  (u.ptr, u.offset, index); }
gather16(Uniform u,I32 index)421         I32 gather16 (Uniform u, I32 index) { return this->gather16 (u.ptr, u.offset, index); }
gather32(Uniform u,I32 index)422         I32 gather32 (Uniform u, I32 index) { return this->gather32 (u.ptr, u.offset, index); }
423 
424         // Load an immediate constant.
425         I32 splat(int      n);
splat(unsigned u)426         I32 splat(unsigned u) { return this->splat((int)u); }
427         F32 splat(float    f);
428 
429         // float math, comparisons, etc.
430         F32 add(F32 x, F32 y);
431         F32 sub(F32 x, F32 y);
432         F32 mul(F32 x, F32 y);
433         F32 div(F32 x, F32 y);
434         F32 min(F32 x, F32 y);
435         F32 max(F32 x, F32 y);
436         F32 mad(F32 x, F32 y, F32 z);  //  x*y+z, often an FMA
437         F32 sqrt(F32 x);
438 
negate(F32 x)439         F32 negate(F32 x) {
440             return sub(splat(0.0f), x);
441         }
lerp(F32 lo,F32 hi,F32 t)442         F32 lerp(F32 lo, F32 hi, F32 t) {
443             return mad(sub(hi,lo), t, lo);
444         }
clamp(F32 x,F32 lo,F32 hi)445         F32 clamp(F32 x, F32 lo, F32 hi) {
446             return max(lo, min(x, hi));
447         }
abs(F32 x)448         F32 abs(F32 x) {
449             return bit_cast(bit_and(bit_cast(x),
450                                     splat(0x7fffffff)));
451         }
fract(F32 x)452         F32 fract(F32 x) {
453             return sub(x, floor(x));
454         }
norm(F32 x,F32 y)455         F32 norm(F32 x, F32 y) {
456             return sqrt(mad(x,x, mul(y,y)));
457         }
458 
459         I32 eq (F32 x, F32 y);
460         I32 neq(F32 x, F32 y);
461         I32 lt (F32 x, F32 y);
462         I32 lte(F32 x, F32 y);
463         I32 gt (F32 x, F32 y);
464         I32 gte(F32 x, F32 y);
465 
466         F32 floor(F32);
467         I32 trunc(F32 x);
468         I32 round(F32 x);
bit_cast(F32 x)469         I32 bit_cast(F32 x) { return {x.id}; }
470 
471         // int math, comparisons, etc.
472         I32 add(I32 x, I32 y);
473         I32 sub(I32 x, I32 y);
474         I32 mul(I32 x, I32 y);
475 
476         I32 shl(I32 x, int bits);
477         I32 shr(I32 x, int bits);
478         I32 sra(I32 x, int bits);
479 
480         I32 eq (I32 x, I32 y);
481         I32 neq(I32 x, I32 y);
482         I32 lt (I32 x, I32 y);
483         I32 lte(I32 x, I32 y);
484         I32 gt (I32 x, I32 y);
485         I32 gte(I32 x, I32 y);
486 
487         F32 to_f32(I32 x);
bit_cast(I32 x)488         F32 bit_cast(I32 x) { return {x.id}; }
489 
490         // Treat each 32-bit lane as a pair of 16-bit ints.
491         I32 add_16x2(I32 x, I32 y);
492         I32 sub_16x2(I32 x, I32 y);
493         I32 mul_16x2(I32 x, I32 y);
494 
495         I32 shl_16x2(I32 x, int bits);
496         I32 shr_16x2(I32 x, int bits);
497         I32 sra_16x2(I32 x, int bits);
498 
499         I32  eq_16x2(I32 x, I32 y);
500         I32 neq_16x2(I32 x, I32 y);
501         I32  lt_16x2(I32 x, I32 y);
502         I32 lte_16x2(I32 x, I32 y);
503         I32  gt_16x2(I32 x, I32 y);
504         I32 gte_16x2(I32 x, I32 y);
505 
506         // Bitwise operations.
507         I32 bit_and  (I32 x, I32 y);
508         I32 bit_or   (I32 x, I32 y);
509         I32 bit_xor  (I32 x, I32 y);
510         I32 bit_clear(I32 x, I32 y);   // x & ~y
511 
512         I32 select(I32 cond, I32 t, I32 f);  // cond ? t : f
select(I32 cond,F32 t,F32 f)513         F32 select(I32 cond, F32 t, F32 f) {
514             return this->bit_cast(this->select(cond, this->bit_cast(t)
515                                                    , this->bit_cast(f)));
516         }
517 
518         // More complex operations...
519 
520         // Shuffle the bytes in x according to each nibble of control, as if
521         //
522         //    uint8_t bytes[] = {
523         //        0,
524         //        ((uint32_t)x      ) & 0xff,
525         //        ((uint32_t)x >>  8) & 0xff,
526         //        ((uint32_t)x >> 16) & 0xff,
527         //        ((uint32_t)x >> 24) & 0xff,
528         //    };
529         //    return (uint32_t)bytes[(control >>  0) & 0xf] <<  0
530         //         | (uint32_t)bytes[(control >>  4) & 0xf] <<  8
531         //         | (uint32_t)bytes[(control >>  8) & 0xf] << 16
532         //         | (uint32_t)bytes[(control >> 12) & 0xf] << 24;
533         //
534         // So, e.g.,
535         //    - bytes(x, 0x1111) splats the low byte of x to all four bytes
536         //    - bytes(x, 0x4321) is x, an identity
537         //    - bytes(x, 0x0000) is 0
538         //    - bytes(x, 0x0404) transforms an RGBA pixel into an A0A0 bit pattern.
539         I32 bytes  (I32 x, int control);
540 
541         I32 extract(I32 x, int bits, I32 z);   // (x>>bits) & z
542         I32 pack   (I32 x, I32 y, int bits);   // x | (y << bits), assuming (x & (y << bits)) == 0
543 
544         // Common idioms used in several places, worth centralizing for consistency.
545         F32 from_unorm(int bits, I32);   // E.g. from_unorm(8, x) -> x * (1/255.0f)
546         I32   to_unorm(int bits, F32);   // E.g.   to_unorm(8, x) -> round(x * 255)
547 
548         Color unpack_1010102(I32 rgba);
549         Color unpack_8888   (I32 rgba);
550         Color unpack_565    (I32 bgr );  // bottom 16 bits
551 
552         void   premul(F32* r, F32* g, F32* b, F32 a);
553         void unpremul(F32* r, F32* g, F32* b, F32 a);
554 
555         Color lerp(Color lo, Color hi, F32 t);
556 
557         void dump(SkWStream* = nullptr) const;
558 
559         uint64_t hash() const;
560 
561     private:
562         struct InstructionHash {
563             uint32_t operator()(const Instruction& inst, uint32_t seed=0) const;
564         };
565 
566         Val push(Op, Val x, Val y=NA, Val z=NA, int immy=0, int immz=0);
567 
568         bool allImm() const;
569 
570         template <typename T, typename... Rest>
571         bool allImm(Val, T* imm, Rest...) const;
572 
573         template <typename T>
isImm(Val id,T want)574         bool isImm(Val id, T want) const {
575             T imm = 0;
576             return this->allImm(id, &imm) && imm == want;
577         }
578 
579         SkTHashMap<Instruction, Val, InstructionHash> fIndex;
580         std::vector<Instruction>                      fProgram;
581         std::vector<int>                              fStrides;
582     };
583 
584     // Helper to streamline allocating and working with uniforms.
585     struct Uniforms {
586         Arg              base;
587         std::vector<int> buf;
588 
UniformsUniforms589         explicit Uniforms(int init) : base(Arg{0}), buf(init) {}
590 
pushUniforms591         Builder::Uniform push(int val) {
592             buf.push_back(val);
593             return {base, (int)( sizeof(int)*(buf.size() - 1) )};
594         }
595 
pushFUniforms596         Builder::Uniform pushF(float val) {
597             int bits;
598             memcpy(&bits, &val, sizeof(int));
599             return this->push(bits);
600         }
601 
pushPtrUniforms602         Builder::Uniform pushPtr(const void* ptr) {
603             // Jam the pointer into 1 or 2 ints.
604             int ints[sizeof(ptr) / sizeof(int)];
605             memcpy(ints, &ptr, sizeof(ptr));
606             for (int bits : ints) {
607                 buf.push_back(bits);
608             }
609             return {base, (int)( sizeof(int)*(buf.size() - SK_ARRAY_COUNT(ints)) )};
610         }
611     };
612 
613     using Reg = int;
614 
615     class Program {
616     public:
617         struct Instruction {   // d = op(x, y/imm, z/imm)
618             Op  op;
619             Reg d,x;
620             union { Reg y; int immy; };
621             union { Reg z; int immz; };
622         };
623 
624         Program(const std::vector<OptimizedInstruction>& interpreter,
625                 const std::vector<int>& strides);
626 
627         Program(const std::vector<OptimizedInstruction>& interpreter,
628                 const std::vector<OptimizedInstruction>& jit,
629                 const std::vector<int>& strides,
630                 const char* debug_name);
631 
632         Program();
633         ~Program();
634         Program(Program&&);
635         Program& operator=(Program&&);
636         Program(const Program&) = delete;
637         Program& operator=(const Program&) = delete;
638 
639         void eval(int n, void* args[]) const;
640 
641         template <typename... T>
eval(int n,T * ...arg)642         void eval(int n, T*... arg) const {
643             SkASSERT(sizeof...(arg) == fStrides.size());
644             // This nullptr isn't important except that it makes args[] non-empty if you pass none.
645             void* args[] = { (void*)arg..., nullptr };
646             this->eval(n, args);
647         }
648 
instructions()649         std::vector<Instruction> instructions() const { return fInstructions; }
nregs()650         int nregs() const { return fRegs; }
loop()651         int loop() const { return fLoop; }
empty()652         bool empty() const { return fInstructions.empty(); }
653 
654         bool hasJIT() const;  // Has this Program been JITted?
655         void dropJIT();       // If hasJIT(), drop it, forcing interpreter fallback.
656 
657         void dump(SkWStream* = nullptr) const;
658 
659     private:
660         void setupInterpreter(const std::vector<OptimizedInstruction>&);
661         void setupJIT        (const std::vector<OptimizedInstruction>&, const char* debug_name);
662 
663         void interpret(int n, void* args[]) const;
664 
665         bool jit(const std::vector<OptimizedInstruction>&,
666                  bool try_hoisting,
667                  Assembler*) const;
668 
669         std::vector<Instruction> fInstructions;
670         int                      fRegs = 0;
671         int                      fLoop = 0;
672         std::vector<int>         fStrides;
673 
674         void*  fJITEntry = nullptr;
675         size_t fJITSize  = 0;
676         void*  fDylib    = nullptr;
677     };
678 
679     // TODO: control flow
680     // TODO: 64-bit values?
681     // TODO: SSE2/SSE4.1, AVX-512F, ARMv8.2 JITs?
682     // TODO: lower to LLVM or WebASM for comparison?
683 }
684 
685 #endif//SkVM_DEFINED
686