/*
 * Copyright 2019 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7 
8 #ifndef SkVM_DEFINED
9 #define SkVM_DEFINED
10 
11 #include "include/core/SkTypes.h"
12 #include "include/private/SkTHash.h"
13 #include <vector>
14 
15 namespace skvm {
16 
17     class Assembler {
18     public:
19         explicit Assembler(void* buf);
20 
21         size_t size() const;
22 
23         // Order matters... GP64, Xmm, Ymm values match 4-bit register encoding for each.
24         enum GP64 {
25             rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi,
26             r8 , r9 , r10, r11, r12, r13, r14, r15,
27         };
28         enum Xmm {
29             xmm0, xmm1, xmm2 , xmm3 , xmm4 , xmm5 , xmm6 , xmm7 ,
30             xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
31         };
32         enum Ymm {
33             ymm0, ymm1, ymm2 , ymm3 , ymm4 , ymm5 , ymm6 , ymm7 ,
34             ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15,
35         };
36 
37         // X and V values match 5-bit encoding for each (nothing tricky).
38         enum X {
39             x0 , x1 , x2 , x3 , x4 , x5 , x6 , x7 ,
40             x8 , x9 , x10, x11, x12, x13, x14, x15,
41             x16, x17, x18, x19, x20, x21, x22, x23,
42             x24, x25, x26, x27, x28, x29, x30, xzr,
43         };
44         enum V {
45             v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 ,
46             v8 , v9 , v10, v11, v12, v13, v14, v15,
47             v16, v17, v18, v19, v20, v21, v22, v23,
48             v24, v25, v26, v27, v28, v29, v30, v31,
49         };
50 
51         void bytes(const void*, int);
52         void byte(uint8_t);
53         void word(uint32_t);
54 
55         // x86-64
56 
57         void align(int mod);
58 
59         void vzeroupper();
60         void ret();
61 
62         void add(GP64, int imm);
63         void sub(GP64, int imm);
64 
65         // All dst = x op y.
66         using DstEqXOpY = void(Ymm dst, Ymm x, Ymm y);
67         DstEqXOpY vpand, vpor, vpxor, vpandn,
68                   vpaddd, vpsubd, vpmulld,
69                           vpsubw, vpmullw,
70                   vaddps, vsubps, vmulps, vdivps,
71                   vfmadd132ps, vfmadd213ps, vfmadd231ps,
72                   vpackusdw, vpackuswb,
73                   vpcmpeqd, vpcmpgtd;
74 
75         using DstEqXOpImm = void(Ymm dst, Ymm x, int imm);
76         DstEqXOpImm vpslld, vpsrld, vpsrad,
77                     vpsrlw,
78                     vpermq;
79 
80         using DstEqOpX = void(Ymm dst, Ymm x);
81         DstEqOpX vmovdqa, vcvtdq2ps, vcvttps2dq;
82 
83         void vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z);
84 
85         struct Label {
86             int                                 offset = 0;
87             enum { None, ARMDisp19, X86Disp32 } kind = None;
88             std::vector<int>                    references;
89         };
90 
91         Label here();
92         void label(Label*);
93 
94         void jmp(Label*);
95         void je (Label*);
96         void jne(Label*);
97         void jl (Label*);
98         void cmp(GP64, int imm);
99 
100         void vbroadcastss(Ymm dst, Label*);
101         void vbroadcastss(Ymm dst, Xmm src);
102         void vbroadcastss(Ymm dst, GP64 ptr, int off);  // dst = *(ptr+off)
103 
104         void vpshufb(Ymm dst, Ymm x, Label*);
105 
106         void vmovups  (Ymm dst, GP64 ptr);   // dst = *ptr, 256-bit
107         void vpmovzxwd(Ymm dst, GP64 ptr);   // dst = *ptr, 128-bit, each uint16_t expanded to int
108         void vpmovzxbd(Ymm dst, GP64 ptr);   // dst = *ptr,  64-bit, each uint8_t  expanded to int
109         void vmovd    (Xmm dst, GP64 ptr);   // dst = *ptr,  32-bit
110 
111         void vmovups(GP64 ptr, Ymm src);     // *ptr = src, 256-bit
112         void vmovups(GP64 ptr, Xmm src);     // *ptr = src, 128-bit
113         void vmovq  (GP64 ptr, Xmm src);     // *ptr = src,  64-bit
114         void vmovd  (GP64 ptr, Xmm src);     // *ptr = src,  32-bit
115 
116         void movzbl(GP64 dst, GP64 ptr, int off);  // dst = *(ptr+off), uint8_t -> int
117         void movb  (GP64 ptr, GP64 src);           // *ptr = src, 8-bit
118 
119         void vmovd_direct(GP64 dst, Xmm src);  // dst = src, 32-bit
120         void vmovd_direct(Xmm dst, GP64 src);  // dst = src, 32-bit
121 
122         void vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm);  // dst = src; dst[imm] = *ptr, 16-bit
123         void vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm);  // dst = src; dst[imm] = *ptr,  8-bit
124 
125         void vpextrw(GP64 ptr, Xmm src, int imm);           // *dst = src[imm]           , 16-bit
126         void vpextrb(GP64 ptr, Xmm src, int imm);           // *dst = src[imm]           ,  8-bit
127 
128         // aarch64
129 
130         // d = op(n,m)
131         using DOpNM = void(V d, V n, V m);
132         DOpNM  and16b, orr16b, eor16b, bic16b,
133                add4s,  sub4s,  mul4s,
134                        sub8h,  mul8h,
135               fadd4s, fsub4s, fmul4s, fdiv4s,
136               tbl;
137 
138         // d += n*m
139         void fmla4s(V d, V n, V m);
140 
141         // d = op(n,imm)
142         using DOpNImm = void(V d, V n, int imm);
143         DOpNImm sli4s,
144                 shl4s, sshr4s, ushr4s,
145                                ushr8h;
146 
147         // d = op(n)
148         using DOpN = void(V d, V n);
149         DOpN scvtf4s,   // int -> float
150              fcvtzs4s,  // truncate float -> int
151              xtns2h,    // u32 -> u16
152              xtnh2b,    // u16 -> u8
153              uxtlb2h,   // u8 -> u16
154              uxtlh2s;   // u16 -> u32
155 
156         // TODO: both these platforms support rounding float->int (vcvtps2dq, fcvtns.4s)... use?
157 
158         void ret (X);
159         void add (X d, X n, int imm12);
160         void sub (X d, X n, int imm12);
161         void subs(X d, X n, int imm12);  // subtract setting condition flags
162 
163         // There's another encoding for unconditional branches that can jump further,
164         // but this one encoded as b.al is simple to implement and should be fine.
b(Label * l)165         void b  (Label* l) { this->b(Condition::al, l); }
bne(Label * l)166         void bne(Label* l) { this->b(Condition::ne, l); }
blt(Label * l)167         void blt(Label* l) { this->b(Condition::lt, l); }
168 
169         // "cmp ..." is just an assembler mnemonic for "subs xzr, ..."!
cmp(X n,int imm12)170         void cmp(X n, int imm12) { this->subs(xzr, n, imm12); }
171 
172         // Compare and branch if zero/non-zero, as if
173         //      cmp(t,0)
174         //      beq/bne(l)
175         // but without setting condition flags.
176         void cbz (X t, Label* l);
177         void cbnz(X t, Label* l);
178 
179         void ldrq(V dst, Label*);  // 128-bit PC-relative load
180 
181         void ldrq(V dst, X src);  // 128-bit dst = *src
182         void ldrs(V dst, X src);  //  32-bit dst = *src
183         void ldrb(V dst, X src);  //   8-bit dst = *src
184 
185         void strq(V src, X dst);  // 128-bit *dst = src
186         void strs(V src, X dst);  //  32-bit *dst = src
187         void strb(V src, X dst);  //   8-bit *dst = src
188 
189     private:
190         // dst = op(dst, imm)
191         void op(int opcode, int opcode_ext, GP64 dst, int imm);
192 
193 
194         // dst = op(x,y) or op(x)
195         void op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W=false);
196         void op(int prefix, int map, int opcode, Ymm dst, Ymm x,        bool W=false) {
197             // Two arguments ops seem to pass them in dst and y, forcing x to 0 so VEX.vvvv == 1111.
198             this->op(prefix, map, opcode, dst,(Ymm)0,x, W);
199         }
200 
201         // dst = op(x,imm)
202         void op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm);
203 
204         // dst = op(x,label) or op(label)
205         void op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l);
206 
207         // *ptr = ymm or ymm = *ptr, depending on opcode.
208         void load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr);
209 
210         // Opcode for 3-arguments ops is split between hi and lo:
211         //    [11 bits hi] [5 bits m] [6 bits lo] [5 bits n] [5 bits d]
212         void op(uint32_t hi, V m, uint32_t lo, V n, V d);
213 
214         // 2-argument ops, with or without an immediate.
215         void op(uint32_t op22, int imm, V n, V d);
op(uint32_t op22,V n,V d)216         void op(uint32_t op22, V n, V d) { this->op(op22,0,n,d); }
op(uint32_t op22,X x,V v)217         void op(uint32_t op22, X x, V v) { this->op(op22,0,(V)x,v); }
218 
219         // Order matters... value is 4-bit encoding for condition code.
220         enum class Condition { eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,al };
221         void b(Condition, Label*);
222 
223         void jump(uint8_t condition, Label*);
224 
225         int disp19(Label*);
226         int disp32(Label*);
227 
228         uint8_t* fCode;
229         uint8_t* fCurr;
230         size_t   fSize;
231     };
232 
233     enum class Op : uint8_t {
234           store8,   store16,   store32,
235     // ↑ side effects / no side effects ↓
236 
237            load8,    load16,    load32,
238          gather8,  gather16,  gather32,
239     // ↑ always varying / uniforms, constants, Just Math ↓
240 
241         uniform8, uniform16, uniform32,
242         splat,
243 
244         add_f32, add_i32, add_i16x2,
245         sub_f32, sub_i32, sub_i16x2,
246         mul_f32, mul_i32, mul_i16x2,
247         div_f32,
248         mad_f32,
249                  shl_i32, shl_i16x2,
250                  shr_i32, shr_i16x2,
251                  sra_i32, sra_i16x2,
252 
253          to_i32,  to_f32,
254 
255          eq_f32,  eq_i32,  eq_i16x2,
256         neq_f32, neq_i32, neq_i16x2,
257          lt_f32,  lt_i32,  lt_i16x2,
258         lte_f32, lte_i32, lte_i16x2,
259          gt_f32,  gt_i32,  gt_i16x2,
260         gte_f32, gte_i32, gte_i16x2,
261 
262         bit_and,
263         bit_or,
264         bit_xor,
265         bit_clear,
266         select,
267 
268         bytes, extract, pack,
269     };
270 
271     using Val = int;
272     // We reserve the last Val ID as a sentinel meaning none, n/a, null, nil, etc.
273     static const Val NA = ~0;
274 
275     struct Arg { int ix; };
276     struct I32 { Val id; };
277     struct F32 { Val id; };
278 
279     class Program;
280 
281     class Builder {
282     public:
283         struct Instruction {
284             Op  op;         // v* = op(x,y,z,imm), where * == index of this Instruction.
285             Val x,y,z;      // Enough arguments for mad().
286             int imm;        // Immediate bit pattern, shift count, argument index, etc.
287 
288             // Not populated until done() has been called.
289             int  death;     // Index of last live instruction taking this input; live if != 0.
290             bool hoist;     // Value independent of all loop variables?
291         };
292 
293         Program done(const char* debug_name = nullptr);
294 
295         // Mostly for debugging, tests, etc.
program()296         std::vector<Instruction> program() const { return fProgram; }
297 
298 
299         // Declare an argument with given stride (use stride=0 for uniforms).
300         // TODO: different types for varying and uniforms?
301         Arg arg(int stride);
302 
303         // Convenience arg() wrappers for most common strides, sizeof(T) and 0.
304         template <typename T>
varying()305         Arg varying() { return this->arg(sizeof(T)); }
uniform()306         Arg uniform() { return this->arg(0); }
307 
308         // TODO: allow uniform (i.e. Arg) offsets to store* and load*?
309         // TODO: sign extension (signed types) for <32-bit loads?
310         // TODO: unsigned integer operations where relevant (just comparisons?)?
311 
312         // Store {8,16,32}-bit varying.
313         void store8 (Arg ptr, I32 val);
314         void store16(Arg ptr, I32 val);
315         void store32(Arg ptr, I32 val);
316 
317         // Load u8,u16,i32 varying.
318         I32 load8 (Arg ptr);
319         I32 load16(Arg ptr);
320         I32 load32(Arg ptr);
321 
322         // Gather u8,u16,i32 with varying element-count offset.
323         I32 gather8 (Arg ptr, I32 offset);
324         I32 gather16(Arg ptr, I32 offset);
325         I32 gather32(Arg ptr, I32 offset);
326 
327         // Load u8,u16,i32 uniform with optional byte-count offset.
328         I32 uniform8 (Arg ptr, int offset=0);
329         I32 uniform16(Arg ptr, int offset=0);
330         I32 uniform32(Arg ptr, int offset=0);
331 
332         // Load an immediate constant.
333         I32 splat(int      n);
splat(unsigned u)334         I32 splat(unsigned u) { return this->splat((int)u); }
335         F32 splat(float    f);
336 
337         // float math, comparisons, etc.
338         F32 add(F32 x, F32 y);
339         F32 sub(F32 x, F32 y);
340         F32 mul(F32 x, F32 y);
341         F32 div(F32 x, F32 y);
342         F32 mad(F32 x, F32 y, F32 z);  //  x*y+z, often an FMA
343 
344         I32 eq (F32 x, F32 y);
345         I32 neq(F32 x, F32 y);
346         I32 lt (F32 x, F32 y);
347         I32 lte(F32 x, F32 y);
348         I32 gt (F32 x, F32 y);
349         I32 gte(F32 x, F32 y);
350 
351         I32 to_i32(F32 x);
bit_cast(F32 x)352         I32 bit_cast(F32 x) { return {x.id}; }
353 
354         // int math, comparisons, etc.
355         I32 add(I32 x, I32 y);
356         I32 sub(I32 x, I32 y);
357         I32 mul(I32 x, I32 y);
358 
359         I32 shl(I32 x, int bits);
360         I32 shr(I32 x, int bits);
361         I32 sra(I32 x, int bits);
362 
363         I32 eq (I32 x, I32 y);
364         I32 neq(I32 x, I32 y);
365         I32 lt (I32 x, I32 y);
366         I32 lte(I32 x, I32 y);
367         I32 gt (I32 x, I32 y);
368         I32 gte(I32 x, I32 y);
369 
370         F32 to_f32(I32 x);
bit_cast(I32 x)371         F32 bit_cast(I32 x) { return {x.id}; }
372 
373         // Treat each 32-bit lane as a pair of 16-bit ints.
374         I32 add_16x2(I32 x, I32 y);
375         I32 sub_16x2(I32 x, I32 y);
376         I32 mul_16x2(I32 x, I32 y);
377 
378         I32 shl_16x2(I32 x, int bits);
379         I32 shr_16x2(I32 x, int bits);
380         I32 sra_16x2(I32 x, int bits);
381 
382         I32  eq_16x2(I32 x, I32 y);
383         I32 neq_16x2(I32 x, I32 y);
384         I32  lt_16x2(I32 x, I32 y);
385         I32 lte_16x2(I32 x, I32 y);
386         I32  gt_16x2(I32 x, I32 y);
387         I32 gte_16x2(I32 x, I32 y);
388 
389         // Bitwise operations.
390         I32 bit_and  (I32 x, I32 y);
391         I32 bit_or   (I32 x, I32 y);
392         I32 bit_xor  (I32 x, I32 y);
393         I32 bit_clear(I32 x, I32 y);   // x & ~y
394 
395         I32 select(I32 cond, I32 t, I32 f);  // cond ? t : f
select(I32 cond,F32 t,F32 f)396         F32 select(I32 cond, F32 t, F32 f) {
397             return this->bit_cast(this->select(cond, this->bit_cast(t)
398                                                    , this->bit_cast(f)));
399         }
400 
401         // More complex operations...
402 
403         // Shuffle the bytes in x according to each nibble of control, as if
404         //
405         //    uint8_t bytes[] = {
406         //        0,
407         //        ((uint32_t)x      ) & 0xff,
408         //        ((uint32_t)x >>  8) & 0xff,
409         //        ((uint32_t)x >> 16) & 0xff,
410         //        ((uint32_t)x >> 24) & 0xff,
411         //    };
412         //    return (uint32_t)bytes[(control >>  0) & 0xf] <<  0
413         //         | (uint32_t)bytes[(control >>  4) & 0xf] <<  8
414         //         | (uint32_t)bytes[(control >>  8) & 0xf] << 16
415         //         | (uint32_t)bytes[(control >> 12) & 0xf] << 24;
416         //
417         // So, e.g.,
418         //    - bytes(x, 0x1111) splats the low byte of x to all four bytes
419         //    - bytes(x, 0x4321) is x, an identity
420         //    - bytes(x, 0x0000) is 0
421         //    - bytes(x, 0x0404) transforms an RGBA pixel into an A0A0 bit pattern.
422         I32 bytes  (I32 x, int control);
423 
424         I32 extract(I32 x, int bits, I32 y);   // (x >> bits) & y
425         I32 pack   (I32 x, I32 y, int bits);   // x | (y << bits), assuming (x & (y << bits)) == 0
426 
427     private:
428         struct InstructionHash {
429             template <typename T>
HashInstructionHash430             static size_t Hash(T val) {
431                 return std::hash<T>{}(val);
432             }
operatorInstructionHash433             size_t operator()(const Instruction& inst) const {
434                 return Hash((uint8_t)inst.op)
435                      ^ Hash(inst.x)
436                      ^ Hash(inst.y)
437                      ^ Hash(inst.z)
438                      ^ Hash(inst.imm)
439                      ^ Hash(inst.death)
440                      ^ Hash(inst.hoist);
441             }
442         };
443 
444         Val push(Op, Val x, Val y=NA, Val z=NA, int imm=0);
445         bool isZero(Val) const;
446 
447         SkTHashMap<Instruction, Val, InstructionHash> fIndex;
448         std::vector<Instruction>                      fProgram;
449         std::vector<int>                              fStrides;
450     };
451 
452     using Reg = int;
453 
454     class Program {
455     public:
456         struct Instruction {   // d = op(x, y, z/imm)
457             Op  op;
458             Reg d,x,y;
459             union { Reg z; int imm; };
460         };
461 
462         Program(const std::vector<Builder::Instruction>& instructions,
463                 const std::vector<int>                 & strides,
464                 const char* debug_name);
465 
466         Program();
467         ~Program();
468         Program(Program&&);
469         Program& operator=(Program&&);
470         Program(const Program&) = delete;
471         Program& operator=(const Program&) = delete;
472 
473         void eval(int n, void* args[]) const;
474 
475         template <typename... T>
eval(int n,T * ...arg)476         void eval(int n, T*... arg) const {
477             SkASSERT(sizeof...(arg) == fStrides.size());
478             // This nullptr isn't important except that it makes args[] non-empty if you pass none.
479             void* args[] = { (void*)arg..., nullptr };
480             this->eval(n, args);
481         }
482 
instructions()483         std::vector<Instruction> instructions() const { return fInstructions; }
nregs()484         int nregs() const { return fRegs; }
loop()485         int loop() const { return fLoop; }
empty()486         bool empty() const { return fInstructions.empty(); }
487 
488         // If this Program has been JITted, drop it, forcing interpreter fallback.
489         void dropJIT();
490 
491     private:
492         void setupInterpreter(const std::vector<Builder::Instruction>&);
493         void setupJIT        (const std::vector<Builder::Instruction>&, const char* debug_name);
494 
495         bool jit(const std::vector<Builder::Instruction>&,
496                  bool hoist,
497                  Assembler*) const;
498 
499         // Dump jit-*.dump files for perf inject.
500         void dumpJIT(const char* debug_name, size_t size) const;
501 
502         std::vector<Instruction> fInstructions;
503         int                      fRegs = 0;
504         int                      fLoop = 0;
505         std::vector<int>         fStrides;
506 
507         void*  fJITBuf  = nullptr;
508         size_t fJITSize = 0;
509     };
510 
    // TODO: control flow
    // TODO: 64-bit values?
    // TODO: SSE2/SSE4.1, AVX-512F, ARMv8.2 JITs?
    // TODO: lower to LLVM or WebAssembly for comparison?
515 }
516 
517 #endif//SkVM_DEFINED
518