• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkColorPriv.h"
9 #include "include/private/SkColorData.h"
10 #include "src/core/SkVM.h"
11 #include "tests/Test.h"
12 #include "tools/Resources.h"
13 #include "tools/SkVMBuilders.h"
14 
15 using Fmt = SrcoverBuilder_F32::Fmt;
fmt_name(Fmt fmt)16 const char* fmt_name(Fmt fmt) {
17     switch (fmt) {
18         case Fmt::A8:        return "A8";
19         case Fmt::G8:        return "G8";
20         case Fmt::RGBA_8888: return "RGBA_8888";
21     }
22     return "";
23 }
24 
25 namespace {
26     using namespace skvm;
27 
28     struct V { Val id; };
29     struct R { Reg id; };
30     struct Shift { int bits; };
31     struct Splat { int bits; };
32     struct Hex   { int bits; };
33 
write(SkWStream * o,const char * s)34     static void write(SkWStream* o, const char* s) {
35         o->writeText(s);
36     }
37 
write(SkWStream * o,Arg a)38     static void write(SkWStream* o, Arg a) {
39         write(o, "arg(");
40         o->writeDecAsText(a.ix);
41         write(o, ")");
42     }
write(SkWStream * o,V v)43     static void write(SkWStream* o, V v) {
44         write(o, "v");
45         o->writeDecAsText(v.id);
46     }
write(SkWStream * o,R r)47     static void write(SkWStream* o, R r) {
48         write(o, "r");
49         o->writeDecAsText(r.id);
50     }
write(SkWStream * o,Shift s)51     static void write(SkWStream* o, Shift s) {
52         o->writeDecAsText(s.bits);
53     }
write(SkWStream * o,Splat s)54     static void write(SkWStream* o, Splat s) {
55         float f;
56         memcpy(&f, &s.bits, 4);
57         o->writeHexAsText(s.bits);
58         write(o, " (");
59         o->writeScalarAsText(f);
60         write(o, ")");
61     }
write(SkWStream * o,Hex h)62     static void write(SkWStream* o, Hex h) {
63         o->writeHexAsText(h.bits);
64     }
65 
66     template <typename T, typename... Ts>
write(SkWStream * o,T first,Ts...rest)67     static void write(SkWStream* o, T first, Ts... rest) {
68         write(o, first);
69         write(o, " ");
70         write(o, rest...);
71     }
72 
dump_builder(const Builder & builder,SkWStream * o)73     static void dump_builder(const Builder& builder, SkWStream* o) {
74         const std::vector<Builder::Instruction> program = builder.program();
75 
76         o->writeDecAsText(program.size());
77         o->writeText(" values:\n");
78         for (Val id = 0; id < (Val)program.size(); id++) {
79             const Builder::Instruction& inst = program[id];
80             Op  op = inst.op;
81             Val  x = inst.x,
82                  y = inst.y,
83                  z = inst.z;
84             int imm = inst.imm;
85             write(o, inst.death == 0 ? "☠️ " :
86                      inst.hoist      ? "↑ " : "  ");
87             switch (op) {
88                 case Op::store8:  write(o, "store8" , Arg{imm}, V{x}); break;
89                 case Op::store16: write(o, "store16", Arg{imm}, V{x}); break;
90                 case Op::store32: write(o, "store32", Arg{imm}, V{x}); break;
91 
92                 case Op::load8:  write(o, V{id}, "= load8" , Arg{imm}); break;
93                 case Op::load16: write(o, V{id}, "= load16", Arg{imm}); break;
94                 case Op::load32: write(o, V{id}, "= load32", Arg{imm}); break;
95 
96                 case Op::gather8:  write(o, V{id}, "= gather8" , Arg{imm}, V{x}); break;
97                 case Op::gather16: write(o, V{id}, "= gather16", Arg{imm}, V{x}); break;
98                 case Op::gather32: write(o, V{id}, "= gather32", Arg{imm}, V{x}); break;
99 
100                 case Op::uniform8:  write(o, V{id}, "= uniform8" , Arg{imm & 0xffff}, Hex{imm>>16}); break;
101                 case Op::uniform16: write(o, V{id}, "= uniform16", Arg{imm & 0xffff}, Hex{imm>>16}); break;
102                 case Op::uniform32: write(o, V{id}, "= uniform32", Arg{imm & 0xffff}, Hex{imm>>16}); break;
103 
104                 case Op::splat:  write(o, V{id}, "= splat", Splat{imm}); break;
105 
106 
107                 case Op::add_f32: write(o, V{id}, "= add_f32", V{x}, V{y}      ); break;
108                 case Op::sub_f32: write(o, V{id}, "= sub_f32", V{x}, V{y}      ); break;
109                 case Op::mul_f32: write(o, V{id}, "= mul_f32", V{x}, V{y}      ); break;
110                 case Op::div_f32: write(o, V{id}, "= div_f32", V{x}, V{y}      ); break;
111                 case Op::mad_f32: write(o, V{id}, "= mad_f32", V{x}, V{y}, V{z}); break;
112 
113                 case Op:: eq_f32: write(o, V{id}, "= eq_f32", V{x}, V{y}); break;
114                 case Op::neq_f32: write(o, V{id}, "= neq_f32", V{x}, V{y}); break;
115                 case Op:: lt_f32: write(o, V{id}, "= lt_f32", V{x}, V{y}); break;
116                 case Op::lte_f32: write(o, V{id}, "= lte_f32", V{x}, V{y}); break;
117                 case Op:: gt_f32: write(o, V{id}, "= gt_f32", V{x}, V{y}); break;
118                 case Op::gte_f32: write(o, V{id}, "= gte_f32", V{x}, V{y}); break;
119 
120 
121                 case Op::add_i32: write(o, V{id}, "= add_i32", V{x}, V{y}); break;
122                 case Op::sub_i32: write(o, V{id}, "= sub_i32", V{x}, V{y}); break;
123                 case Op::mul_i32: write(o, V{id}, "= mul_i32", V{x}, V{y}); break;
124 
125                 case Op::shl_i32: write(o, V{id}, "= shl_i32", V{x}, Shift{imm}); break;
126                 case Op::shr_i32: write(o, V{id}, "= shr_i32", V{x}, Shift{imm}); break;
127                 case Op::sra_i32: write(o, V{id}, "= sra_i32", V{x}, Shift{imm}); break;
128 
129                 case Op:: eq_i32: write(o, V{id}, "= eq_i32", V{x}, V{y}); break;
130                 case Op::neq_i32: write(o, V{id}, "= neq_i32", V{x}, V{y}); break;
131                 case Op:: lt_i32: write(o, V{id}, "= lt_i32", V{x}, V{y}); break;
132                 case Op::lte_i32: write(o, V{id}, "= lte_i32", V{x}, V{y}); break;
133                 case Op:: gt_i32: write(o, V{id}, "= gt_i32", V{x}, V{y}); break;
134                 case Op::gte_i32: write(o, V{id}, "= gte_i32", V{x}, V{y}); break;
135 
136                 case Op::add_i16x2: write(o, V{id}, "= add_i16x2", V{x}, V{y}); break;
137                 case Op::sub_i16x2: write(o, V{id}, "= sub_i16x2", V{x}, V{y}); break;
138                 case Op::mul_i16x2: write(o, V{id}, "= mul_i16x2", V{x}, V{y}); break;
139 
140                 case Op::shl_i16x2: write(o, V{id}, "= shl_i16x2", V{x}, Shift{imm}); break;
141                 case Op::shr_i16x2: write(o, V{id}, "= shr_i16x2", V{x}, Shift{imm}); break;
142                 case Op::sra_i16x2: write(o, V{id}, "= sra_i16x2", V{x}, Shift{imm}); break;
143 
144                 case Op:: eq_i16x2: write(o, V{id}, "= eq_i16x2", V{x}, V{y}); break;
145                 case Op::neq_i16x2: write(o, V{id}, "= neq_i16x2", V{x}, V{y}); break;
146                 case Op:: lt_i16x2: write(o, V{id}, "= lt_i16x2", V{x}, V{y}); break;
147                 case Op::lte_i16x2: write(o, V{id}, "= lte_i16x2", V{x}, V{y}); break;
148                 case Op:: gt_i16x2: write(o, V{id}, "= gt_i16x2", V{x}, V{y}); break;
149                 case Op::gte_i16x2: write(o, V{id}, "= gte_i16x2", V{x}, V{y}); break;
150 
151                 case Op::bit_and  : write(o, V{id}, "= bit_and"  , V{x}, V{y}      ); break;
152                 case Op::bit_or   : write(o, V{id}, "= bit_or"   , V{x}, V{y}      ); break;
153                 case Op::bit_xor  : write(o, V{id}, "= bit_xor"  , V{x}, V{y}      ); break;
154                 case Op::bit_clear: write(o, V{id}, "= bit_clear", V{x}, V{y}      ); break;
155                 case Op::select   : write(o, V{id}, "= select"   , V{x}, V{y}, V{z}); break;
156 
157                 case Op::bytes:   write(o, V{id}, "= bytes",   V{x}, Hex{imm}); break;
158                 case Op::extract: write(o, V{id}, "= extract", V{x}, Shift{imm}, V{y}); break;
159                 case Op::pack:    write(o, V{id}, "= pack",    V{x}, V{y}, Shift{imm}); break;
160 
161                 case Op::to_f32: write(o, V{id}, "= to_f32", V{x}); break;
162                 case Op::to_i32: write(o, V{id}, "= to_i32", V{x}); break;
163             }
164 
165             write(o, "\n");
166         }
167     }
168 
dump_program(const Program & program,SkWStream * o)169     static void dump_program(const Program& program, SkWStream* o) {
170         const std::vector<Program::Instruction> instructions = program.instructions();
171         const int nregs = program.nregs();
172         const int loop  = program.loop();
173 
174         o->writeDecAsText(nregs);
175         o->writeText(" registers, ");
176         o->writeDecAsText(instructions.size());
177         o->writeText(" instructions:\n");
178         for (int i = 0; i < (int)instructions.size(); i++) {
179             if (i == loop) {
180                 write(o, "loop:\n");
181             }
182             const Program::Instruction& inst = instructions[i];
183             Op   op = inst.op;
184             Reg   d = inst.d,
185                   x = inst.x,
186                   y = inst.y,
187                   z = inst.z;
188             int imm = inst.imm;
189             switch (op) {
190                 case Op::store8:  write(o, "store8" , Arg{imm}, R{x}); break;
191                 case Op::store16: write(o, "store16", Arg{imm}, R{x}); break;
192                 case Op::store32: write(o, "store32", Arg{imm}, R{x}); break;
193 
194                 case Op::load8:  write(o, R{d}, "= load8" , Arg{imm}); break;
195                 case Op::load16: write(o, R{d}, "= load16", Arg{imm}); break;
196                 case Op::load32: write(o, R{d}, "= load32", Arg{imm}); break;
197 
198                 case Op::gather8:  write(o, R{d}, "= gather8" , Arg{imm}, R{x}); break;
199                 case Op::gather16: write(o, R{d}, "= gather16", Arg{imm}, R{x}); break;
200                 case Op::gather32: write(o, R{d}, "= gather32", Arg{imm}, R{x}); break;
201 
202                 case Op::uniform8:  write(o, R{d}, "= uniform8" , Arg{imm & 0xffff}, Hex{imm>>16}); break;
203                 case Op::uniform16: write(o, R{d}, "= uniform16", Arg{imm & 0xffff}, Hex{imm>>16}); break;
204                 case Op::uniform32: write(o, R{d}, "= uniform32", Arg{imm & 0xffff}, Hex{imm>>16}); break;
205 
206                 case Op::splat:  write(o, R{d}, "= splat", Splat{imm}); break;
207 
208 
209                 case Op::add_f32: write(o, R{d}, "= add_f32", R{x}, R{y}      ); break;
210                 case Op::sub_f32: write(o, R{d}, "= sub_f32", R{x}, R{y}      ); break;
211                 case Op::mul_f32: write(o, R{d}, "= mul_f32", R{x}, R{y}      ); break;
212                 case Op::div_f32: write(o, R{d}, "= div_f32", R{x}, R{y}      ); break;
213                 case Op::mad_f32: write(o, R{d}, "= mad_f32", R{x}, R{y}, R{z}); break;
214 
215                 case Op:: eq_f32: write(o, R{d}, "= eq_f32", R{x}, R{y}); break;
216                 case Op::neq_f32: write(o, R{d}, "= neq_f32", R{x}, R{y}); break;
217                 case Op:: lt_f32: write(o, R{d}, "= lt_f32", R{x}, R{y}); break;
218                 case Op::lte_f32: write(o, R{d}, "= lte_f32", R{x}, R{y}); break;
219                 case Op:: gt_f32: write(o, R{d}, "= gt_f32", R{x}, R{y}); break;
220                 case Op::gte_f32: write(o, R{d}, "= gte_f32", R{x}, R{y}); break;
221 
222 
223                 case Op::add_i32: write(o, R{d}, "= add_i32", R{x}, R{y}); break;
224                 case Op::sub_i32: write(o, R{d}, "= sub_i32", R{x}, R{y}); break;
225                 case Op::mul_i32: write(o, R{d}, "= mul_i32", R{x}, R{y}); break;
226 
227                 case Op::shl_i32: write(o, R{d}, "= shl_i32", R{x}, Shift{imm}); break;
228                 case Op::shr_i32: write(o, R{d}, "= shr_i32", R{x}, Shift{imm}); break;
229                 case Op::sra_i32: write(o, R{d}, "= sra_i32", R{x}, Shift{imm}); break;
230 
231                 case Op:: eq_i32: write(o, R{d}, "= eq_i32", R{x}, R{y}); break;
232                 case Op::neq_i32: write(o, R{d}, "= neq_i32", R{x}, R{y}); break;
233                 case Op:: lt_i32: write(o, R{d}, "= lt_i32", R{x}, R{y}); break;
234                 case Op::lte_i32: write(o, R{d}, "= lte_i32", R{x}, R{y}); break;
235                 case Op:: gt_i32: write(o, R{d}, "= gt_i32", R{x}, R{y}); break;
236                 case Op::gte_i32: write(o, R{d}, "= gte_i32", R{x}, R{y}); break;
237 
238 
239                 case Op::add_i16x2: write(o, R{d}, "= add_i16x2", R{x}, R{y}); break;
240                 case Op::sub_i16x2: write(o, R{d}, "= sub_i16x2", R{x}, R{y}); break;
241                 case Op::mul_i16x2: write(o, R{d}, "= mul_i16x2", R{x}, R{y}); break;
242 
243                 case Op::shl_i16x2: write(o, R{d}, "= shl_i16x2", R{x}, Shift{imm}); break;
244                 case Op::shr_i16x2: write(o, R{d}, "= shr_i16x2", R{x}, Shift{imm}); break;
245                 case Op::sra_i16x2: write(o, R{d}, "= sra_i16x2", R{x}, Shift{imm}); break;
246 
247                 case Op:: eq_i16x2: write(o, R{d}, "= eq_i16x2", R{x}, R{y}); break;
248                 case Op::neq_i16x2: write(o, R{d}, "= neq_i16x2", R{x}, R{y}); break;
249                 case Op:: lt_i16x2: write(o, R{d}, "= lt_i16x2", R{x}, R{y}); break;
250                 case Op::lte_i16x2: write(o, R{d}, "= lte_i16x2", R{x}, R{y}); break;
251                 case Op:: gt_i16x2: write(o, R{d}, "= gt_i16x2", R{x}, R{y}); break;
252                 case Op::gte_i16x2: write(o, R{d}, "= gte_i16x2", R{x}, R{y}); break;
253 
254 
255                 case Op::bit_and  : write(o, R{d}, "= bit_and"  , R{x}, R{y}      ); break;
256                 case Op::bit_or   : write(o, R{d}, "= bit_or"   , R{x}, R{y}      ); break;
257                 case Op::bit_xor  : write(o, R{d}, "= bit_xor"  , R{x}, R{y}      ); break;
258                 case Op::bit_clear: write(o, R{d}, "= bit_clear", R{x}, R{y}      ); break;
259                 case Op::select   : write(o, R{d}, "= select"   , R{x}, R{y}, R{z}); break;
260 
261                 case Op::bytes:   write(o, R{d}, "= bytes", R{x}, Hex{imm}); break;
262                 case Op::extract: write(o, R{d}, "= extract", R{x}, Shift{imm}, R{y}); break;
263                 case Op::pack:    write(o, R{d}, "= pack",    R{x}, R{y}, Shift{imm}); break;
264 
265                 case Op::to_f32: write(o, R{d}, "= to_f32", R{x}); break;
266                 case Op::to_i32: write(o, R{d}, "= to_i32", R{x}); break;
267             }
268             write(o, "\n");
269         }
270     }
271 
dump(Builder & builder,SkWStream * o)272     static void dump(Builder& builder, SkWStream* o) {
273         skvm::Program program = builder.done();
274         dump_builder(builder, o);
275         o->writeText("\n");
276         dump_program(program, o);
277         o->writeText("\n");
278     }
279 
280 }  // namespace
281 
282 template <typename Fn>
test_jit_and_interpreter(skvm::Program && program,Fn && test)283 static void test_jit_and_interpreter(skvm::Program&& program, Fn&& test) {
284     test((const skvm::Program&) program);
285     program.dropJIT();
286     test((const skvm::Program&) program);
287 }
288 
DEF_TEST(SkVM,r)289 DEF_TEST(SkVM, r) {
290     SkDynamicMemoryWStream buf;
291 
292     // Write all combinations of SrcoverBuilder_F32
293     for (int s = 0; s < 3; s++)
294     for (int d = 0; d < 3; d++) {
295         auto srcFmt = (Fmt)s,
296              dstFmt = (Fmt)d;
297         SrcoverBuilder_F32 builder{srcFmt, dstFmt};
298 
299         buf.writeText(fmt_name(srcFmt));
300         buf.writeText(" over ");
301         buf.writeText(fmt_name(dstFmt));
302         buf.writeText("\n");
303         dump(builder, &buf);
304     }
305 
306     // Write the I32 Srcovers also.
307     {
308         SrcoverBuilder_I32_Naive builder;
309         buf.writeText("I32 (Naive) 8888 over 8888\n");
310         dump(builder, &buf);
311     }
312     {
313         SrcoverBuilder_I32 builder;
314         buf.writeText("I32 8888 over 8888\n");
315         dump(builder, &buf);
316     }
317     {
318         SrcoverBuilder_I32_SWAR builder;
319         buf.writeText("I32 (SWAR) 8888 over 8888\n");
320         dump(builder, &buf);
321     }
322 
323     {
324         skvm::Builder b;
325         skvm::Arg arg = b.varying<int>();
326 
327         // x and y can both be hoisted,
328         // and x can die at y, while y lives forever.
329         skvm::I32 x = b.splat(1),
330                   y = b.add(x, b.splat(2));
331         b.store32(arg, b.mul(b.load32(arg), y));
332 
333         skvm::Program program = b.done();
334         REPORTER_ASSERT(r, program.nregs() == 2);
335 
336         std::vector<skvm::Builder::Instruction> insts = b.program();
337         REPORTER_ASSERT(r, insts.size() == 6);
338         REPORTER_ASSERT(r,  insts[0].hoist && insts[0].death == 2);
339         REPORTER_ASSERT(r,  insts[1].hoist && insts[1].death == 2);
340         REPORTER_ASSERT(r,  insts[2].hoist && insts[2].death == 6);
341         REPORTER_ASSERT(r, !insts[3].hoist);
342         REPORTER_ASSERT(r, !insts[4].hoist);
343         REPORTER_ASSERT(r, !insts[5].hoist);
344 
345         dump(b, &buf);
346 
347         test_jit_and_interpreter(std::move(program), [&](const skvm::Program& program) {
348             int arg[] = {0,1,2,3,4,5,6,7,8,9};
349 
350             program.eval(SK_ARRAY_COUNT(arg), arg);
351 
352             for (int i = 0; i < (int)SK_ARRAY_COUNT(arg); i++) {
353                 REPORTER_ASSERT(r, arg[i] == i*3);
354             }
355         });
356     }
357 
358     sk_sp<SkData> blob = buf.detachAsData();
359     {
360 
361         sk_sp<SkData> expected = GetResourceAsData("SkVMTest.expected");
362         REPORTER_ASSERT(r, expected, "Couldn't load SkVMTest.expected.");
363         if (expected) {
364             if (blob->size() != expected->size()
365                     || 0 != memcmp(blob->data(), expected->data(), blob->size())) {
366 
367                 ERRORF(r, "SkVMTest expected\n%.*s\nbut got\n%.*s\n",
368                        expected->size(), expected->data(),
369                        blob->size(), blob->data());
370             }
371 
372             SkFILEWStream out(GetResourcePath("SkVMTest.expected").c_str());
373             if (out.isValid()) {
374                 out.write(blob->data(), blob->size());
375             }
376         }
377     }
378 
379     auto test_8888 = [&](skvm::Program&& program) {
380         uint32_t src[9];
381         uint32_t dst[SK_ARRAY_COUNT(src)];
382 
383         test_jit_and_interpreter(std::move(program), [&](const skvm::Program& program) {
384             for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
385                 src[i] = 0xbb007733;
386                 dst[i] = 0xffaaccee;
387             }
388 
389             SkPMColor expected = SkPMSrcOver(src[0], dst[0]);  // 0xff2dad73
390 
391             program.eval((int)SK_ARRAY_COUNT(src), src, dst);
392 
393             // dst is probably 0xff2dad72.
394             for (auto got : dst) {
395                 auto want = expected;
396                 for (int i = 0; i < 4; i++) {
397                     uint8_t d = got  & 0xff,
398                             w = want & 0xff;
399                     if (abs(d-w) >= 2) {
400                         SkDebugf("d %02x, w %02x\n", d,w);
401                     }
402                     REPORTER_ASSERT(r, abs(d-w) < 2);
403                     got  >>= 8;
404                     want >>= 8;
405                 }
406             }
407         });
408     };
409 
410     test_8888(SrcoverBuilder_F32{Fmt::RGBA_8888, Fmt::RGBA_8888}.done("srcover_f32"));
411     test_8888(SrcoverBuilder_I32_Naive{}.done("srcover_i32_naive"));
412     test_8888(SrcoverBuilder_I32{}.done("srcover_i32"));
413     test_8888(SrcoverBuilder_I32_SWAR{}.done("srcover_i32_SWAR"));
414 
415     test_jit_and_interpreter(SrcoverBuilder_F32{Fmt::RGBA_8888, Fmt::G8}.done(),
416                              [&](const skvm::Program& program) {
417         uint32_t src[9];
418         uint8_t  dst[SK_ARRAY_COUNT(src)];
419 
420         for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
421             src[i] = 0xbb007733;
422             dst[i] = 0x42;
423         }
424 
425         SkPMColor over = SkPMSrcOver(SkPackARGB32(0xbb, 0x33, 0x77, 0x00),
426                                      0xff424242);
427 
428         uint8_t want = SkComputeLuminance(SkGetPackedR32(over),
429                                           SkGetPackedG32(over),
430                                           SkGetPackedB32(over));
431         program.eval((int)SK_ARRAY_COUNT(src), src, dst);
432 
433         for (auto got : dst) {
434             REPORTER_ASSERT(r, abs(got-want) < 3);
435         }
436     });
437 
438     test_jit_and_interpreter(SrcoverBuilder_F32{Fmt::A8, Fmt::A8}.done(),
439                              [&](const skvm::Program& program) {
440         uint8_t src[256],
441                 dst[256];
442         for (int i = 0; i < 256; i++) {
443             src[i] = 255 - i;
444             dst[i] = i;
445         }
446 
447         program.eval(256, src, dst);
448 
449         for (int i = 0; i < 256; i++) {
450             uint8_t want = SkGetPackedA32(SkPMSrcOver(SkPackARGB32(src[i], 0,0,0),
451                                                       SkPackARGB32(     i, 0,0,0)));
452             REPORTER_ASSERT(r, abs(dst[i]-want) < 2);
453         }
454     });
455 }
456 
// A program with no memory arguments: every instruction should be flagged as
// dead (death == 0) and hoisted, yet the program must still be runnable.
DEF_TEST(SkVM_Pointless, r) {
    skvm::Builder b;
    {
        // The sum is never stored anywhere.
        b.add(b.splat(5.0f),
              b.splat(4.0f));
    }

    // Evaluate for every count from 0 through 63.
    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int n = 0;
        while (n < 64) {
            program.eval(n);
            n++;
        }
    });

    const std::vector<skvm::Builder::Instruction> insts = b.program();
    for (const skvm::Builder::Instruction& inst : insts) {
        const bool dead    = inst.death == 0;
        const bool hoisted = inst.hoist == true;
        REPORTER_ASSERT(r, dead && hoisted);
    }
}
476 
// Runs "buf[i] += 1" for every count N from 0 through 64 and checks that
// exactly the first N elements were incremented.
DEF_TEST(SkVM_LoopCounts, r) {
    skvm::Builder b;
    skvm::Arg arg = b.varying<int>();
    b.store32(arg,
              b.add(b.splat(1),
                    b.load32(arg)));

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int buf[64];
        const int len = (int)SK_ARRAY_COUNT(buf);

        for (int N = 0; N <= len; N++) {
            // Refill with buf[i] == i before each run.
            for (int i = 0; i < len; i++) {
                buf[i] = i;
            }

            program.eval(N, buf);

            // Elements below N were bumped by one; the rest are untouched.
            for (int i = 0; i < len; i++) {
                if (i < N) {
                    REPORTER_ASSERT(r, buf[i] == i+1);
                } else {
                    REPORTER_ASSERT(r, buf[i] == i);
                }
            }
        }
    });
}
504 
// Exercises gather32/gather16/gather8 from one uniform image pointer, indexed
// by the loaded value masked with 7, 15 and 31 respectively.
DEF_TEST(SkVM_gathers, r) {
    skvm::Builder b;
    {
        // Argument order here fixes the order of the pointers passed to eval().
        skvm::Arg img   = b.uniform();
        skvm::Arg buf32 = b.varying<int>();
        skvm::Arg buf16 = b.varying<uint16_t>();
        skvm::Arg buf8  = b.varying<uint8_t>();

        skvm::I32 x = b.load32(buf32);

        b.store32(buf32, b.gather32(img, b.bit_and(x, b.splat( 7))));
        b.store16(buf16, b.gather16(img, b.bit_and(x, b.splat(15))));
        b.store8 (buf8 , b.gather8 (img, b.bit_and(x, b.splat(31))));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        constexpr int N = 20;
        int      buf32[N];
        uint16_t buf16[N];
        uint8_t  buf8 [N];

        for (int i = 0; i < N; i++) {
            buf32[i] = i;
        }

        program.eval(N, img, buf32, buf16, buf8);

        // Expected contents of {buf32, buf16, buf8} at each index after eval().
        const struct { int want32, want16, want8; } kExpected[N] = {
            {12, 12, 12}, {34,  0,  0}, {56, 34,  0}, {78,  0,  0},
            {90, 56, 34}, {98,  0,  0}, {76, 78,  0}, {54,  0,  0},

            {12, 90, 56}, {34,  0,  0}, {56, 98,  0}, {78,  0,  0},
            {90, 76, 78}, {98,  0,  0}, {76, 54,  0}, {54,  0,  0},

            {12, 12, 90}, {34,  0,  0}, {56, 34,  0}, {78,  0,  0},
        };

        for (int i = 0; i < N; i++) {
            REPORTER_ASSERT(r, buf32[i] == kExpected[i].want32
                            && buf16[i] == kExpected[i].want16
                            && buf8 [i] == kExpected[i].want8);
        }
    });
}
558 
// Chains the bitwise ops and the three shifts on a single value; the trailing
// comments track the value when the input is 0x42.
DEF_TEST(SkVM_bitops, r) {
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<int>();

        skvm::I32 v = b.load32(arg);

        v = b.bit_and  (v, b.splat(0xf1));  // 0x40
        v = b.bit_or   (v, b.splat(0x80));  // 0xc0
        v = b.bit_xor  (v, b.splat(0xfe));  // 0x3e
        v = b.bit_clear(v, b.splat(0x30));  // 0x0e

        v = b.shl(v, 28);  // 0xe000'0000
        v = b.sra(v, 28);  // 0xffff'fffe
        v = b.shr(v,  1);  // 0x7fff'ffff

        b.store32(arg, v);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int value = 0x42;
        program.eval(1, &value);

        const int want = 0x7fff'ffff;
        REPORTER_ASSERT(r, value == want);
    });
}
584 
// Float add/sub/div: computes ((x+x)-x)/x, which must come out as exactly 1
// for each of the inputs 1..9.
DEF_TEST(SkVM_f32, r) {
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<float>();

        skvm::F32 x       = b.bit_cast(b.load32(arg));
        skvm::F32 doubled = b.add(x, x);          // 2x
        skvm::F32 back    = b.sub(doubled, x);    // 2x-x = x
        skvm::F32 ratio   = b.div(back, x);       // x/x = 1
        b.store32(arg, b.bit_cast(ratio));
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        float buf[] = { 1,2,3,4,5,6,7,8,9 };
        program.eval(SK_ARRAY_COUNT(buf), buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            REPORTER_ASSERT(r, buf[i] == 1.0f);
        }
    });
}
605 
// Integer comparisons: packs the results of eq/neq/lt/lte/gt/gte against the
// constants 0..5 into bits 0..5 of the output, one bit per comparison.
DEF_TEST(SkVM_cmp_i32, r) {
    skvm::Builder b;
    {
        skvm::I32 x = b.load32(b.varying<int>());

        // Reduce a comparison mask to its low bit and move it to `shift`.
        auto to_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 m = b.splat(0);
        m = b.bit_or(m, to_bit(0, b. eq(x, b.splat(0))));
        m = b.bit_or(m, to_bit(1, b.neq(x, b.splat(1))));
        m = b.bit_or(m, to_bit(2, b. lt(x, b.splat(2))));
        m = b.bit_or(m, to_bit(3, b.lte(x, b.splat(3))));
        m = b.bit_or(m, to_bit(4, b. gt(x, b.splat(4))));
        m = b.bit_or(m, to_bit(5, b.gte(x, b.splat(5))));

        b.store32(b.varying<int>(), m);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0..4 each have their own bit pattern; 5 and up all share one.
        const int kLow[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        const int kHigh  = 0b110010;

        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            const int want = i < (int)SK_ARRAY_COUNT(kLow) ? kLow[i] : kHigh;
            REPORTER_ASSERT(r, out[i] == want);
        }
    });
}
642 
// Float comparisons: packs the results of eq/neq/lt/lte/gt/gte against the
// constants 0.0f..5.0f into bits 0..5 of the output, one bit per comparison.
DEF_TEST(SkVM_cmp_f32, r) {
    skvm::Builder b;
    {
        skvm::F32 x = b.bit_cast(b.load32(b.varying<float>()));

        // Reduce a comparison mask to its low bit and move it to `shift`.
        auto to_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 m = b.splat(0);
        m = b.bit_or(m, to_bit(0, b. eq(x, b.splat(0.0f))));
        m = b.bit_or(m, to_bit(1, b.neq(x, b.splat(1.0f))));
        m = b.bit_or(m, to_bit(2, b. lt(x, b.splat(2.0f))));
        m = b.bit_or(m, to_bit(3, b.lte(x, b.splat(3.0f))));
        m = b.bit_or(m, to_bit(4, b. gt(x, b.splat(4.0f))));
        m = b.bit_or(m, to_bit(5, b.gte(x, b.splat(5.0f))));

        b.store32(b.varying<int>(), m);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        float in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0..4 each have their own bit pattern; 5 and up all share one.
        const int kLow[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        const int kHigh  = 0b110010;

        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            const int want = i < (int)SK_ARRAY_COUNT(kLow) ? kLow[i] : kHigh;
            REPORTER_ASSERT(r, out[i] == want);
        }
    });
}
679 
// 16x2 arithmetic and shifts: computes x(2x-1) per 16-bit lane, then shifts
// left and arithmetically right by 7.
DEF_TEST(SkVM_i16x2, r) {
    skvm::Builder b;
    {
        skvm::Arg buf = b.varying<int>();

        skvm::I32 x = b.load32(buf);
        skvm::I32 y = b.add_16x2(x,x);   // y = 2x
        skvm::I32 z = b.mul_16x2(x,y);   // z = 2x^2
        skvm::I32 w = b.sub_16x2(z,x);   // w = x(2x-1)
        skvm::I32 v = b.shl_16x2(w,7);   // These shifts will be a no-op
        skvm::I32 u = b.sra_16x2(v,7);   // for all but x=12 and x=13.
        b.store32(buf, u);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        uint16_t lanes[] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13 };

        // Two 16-bit lanes per 32-bit element, hence half the array count.
        program.eval(SK_ARRAY_COUNT(lanes)/2, lanes);

        int i = 0;
        while (i < 12) {
            REPORTER_ASSERT(r, lanes[i] == i*(2*i-1));
            i++;
        }
        // The last two lanes do not survive the shift round trip unchanged.
        REPORTER_ASSERT(r, lanes[12] == 0xff14);   // 12*23 = 0x114
        REPORTER_ASSERT(r, lanes[13] == 0xff45);   // 13*25 = 0x145
    });
}
705 
// 16x2 comparisons: per 16-bit lane, packs the results of eq/neq/lt/lte/gt/gte
// against the constants 0..5 into bits 0..5 of that lane.
DEF_TEST(SkVM_cmp_i16, r) {
    skvm::Builder b;
    {
        skvm::Arg buf = b.varying<int>();
        skvm::I32 x = b.load32(buf);

        // Reduce each lane of a comparison mask to its low bit, moved to `shift`.
        auto to_bit = [&](int shift, skvm::I32 mask) {
            return b.shl_16x2(b.bit_and(mask, b.splat(0x0001'0001)), shift);
        };

        skvm::I32 m = b.splat(0);
        m = b.bit_or(m, to_bit(0, b. eq_16x2(x, b.splat(0x0000'0000))));
        m = b.bit_or(m, to_bit(1, b.neq_16x2(x, b.splat(0x0001'0001))));
        m = b.bit_or(m, to_bit(2, b. lt_16x2(x, b.splat(0x0002'0002))));
        m = b.bit_or(m, to_bit(3, b.lte_16x2(x, b.splat(0x0003'0003))));
        m = b.bit_or(m, to_bit(4, b. gt_16x2(x, b.splat(0x0004'0004))));
        m = b.bit_or(m, to_bit(5, b.gte_16x2(x, b.splat(0x0005'0005))));

        b.store32(buf, m);
    }

    test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
        int16_t lanes[] = { 0,1, 2,3, 4,5, 6,7, 8,9 };

        // Two 16-bit lanes per 32-bit element, hence half the array count.
        program.eval(SK_ARRAY_COUNT(lanes)/2, lanes);

        // Lanes 0..4 each have their own bit pattern; 5 and up all share one.
        const int kLow[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        const int kHigh  = 0b110010;

        for (int i = 0; i < (int)SK_ARRAY_COUNT(lanes); i++) {
            const int want = i < (int)SK_ARRAY_COUNT(kLow) ? kLow[i] : kHigh;
            REPORTER_ASSERT(r, lanes[i] == want);
        }
    });
}
742 
743 
DEF_TEST(SkVM_mad, r) {
    // A chain of multiply-adds arranged to poke at the awkward cases of
    // instruction and register selection for Op::mad_f32.

    skvm::Builder builder;
    {
        skvm::Arg ptr = builder.varying<int>();

        skvm::F32 x = builder.to_f32(builder.load32(ptr));

        // x is needed in the future, so r[x] != r[y].
        skvm::F32 y = builder.mad(x,x,x);

        // y is needed in the future, but r[z] = r[x] is ok.
        skvm::F32 z = builder.mad(y,y,x);

        // w can alias z but not y.
        skvm::F32 w = builder.mad(z,z,y);

        // Got to stop somewhere.
        skvm::F32 v = builder.mad(w,y,w);

        builder.store32(ptr, builder.to_i32(v));
    }

    test_jit_and_interpreter(builder.done(), [&](const skvm::Program& program) {
        // Starting from x = 2:
        //   y =    2*2    +    2 =     6
        //   z =    6*6    +    2 =    38
        //   w =   38*38   +    6 =  1450
        //   v = 1450*6    + 1450 = 10150
        int x = 2;
        program.eval(1, &x);
        REPORTER_ASSERT(r, x == 10150);
    });
}
771 
DEF_TEST(SkVM_madder, r) {
    // A second multiply-add chain, this time on floats loaded via bit_cast,
    // with a different pattern of which inputs stay live.
    skvm::Builder builder;
    {
        skvm::Arg ptr = builder.varying<float>();

        skvm::F32 x = builder.bit_cast(builder.load32(ptr));

        // x is needed in the future, so r[x] != r[y].
        skvm::F32 y = builder.mad(x,x,x);

        // r[x] can be reused after this instruction, but not r[y].
        skvm::F32 z = builder.mad(y,x,y);

        skvm::F32 w = builder.mad(y,y,z);

        builder.store32(ptr, builder.bit_cast(w));
    }

    test_jit_and_interpreter(builder.done(), [&](const skvm::Program& program) {
        // Starting from x = 2:
        //   y = 2*2 +  2 =  6
        //   z = 6*2 +  6 = 18
        //   w = 6*6 + 18 = 54
        float x = 2.0f;
        program.eval(1, &x);
        REPORTER_ASSERT(r, x == 54.0f);
    });
}
793 
DEF_TEST(SkVM_hoist, r) {
    // With this many constants the program will fail to JIT if they are all
    // hoisted.  The JIT is expected to retry without hoisting, at which
    // point 2 registers are enough.
    skvm::Builder builder;
    {
        skvm::Arg ptr = builder.varying<int>();

        skvm::I32 x = builder.load32(ptr);
        for (int k = 0; k != 32; ++k) {
            skvm::I32 addend = builder.splat(k);
            x = builder.add(x, addend);
        }

        builder.store32(ptr, x);
    }

    test_jit_and_interpreter(builder.done(), [&](const skvm::Program& program) {
        // The program adds 0 + 1 + 2 + ... + 31 = 496 to its input.
        int x = 4;
        program.eval(1, &x);
        REPORTER_ASSERT(r, x == 500);
    });
}
815 
DEF_TEST(SkVM_select, r) {
    // Values greater than 4 pass through; everything else is replaced by 42.
    skvm::Builder builder;
    {
        skvm::Arg ptr = builder.varying<int>();

        skvm::I32 x = builder.load32(ptr);

        // Kept as one expression so the builder sees its operands created
        // in whatever order the compiler evaluates these arguments.
        x = builder.select( builder.gt(x, builder.splat(4)), x, builder.splat(42) );

        builder.store32(ptr, x);
    }

    test_jit_and_interpreter(builder.done(), [&](const skvm::Program& program) {
        int buf[] = { 0,1,2,3,4,5,6,7,8 };
        program.eval(SK_ARRAY_COUNT(buf), buf);

        const int count = (int)SK_ARRAY_COUNT(buf);
        for (int i = 0; i != count; ++i) {
            REPORTER_ASSERT(r, buf[i] == (i > 4 ? i : 42));
        }
    });
}
836 
DEF_TEST(SkVM_NewOps,r)837 DEF_TEST(SkVM_NewOps, r) {
838     // Exercise a somewhat arbitrary set of new ops.
839     skvm::Builder b;
840     {
841         skvm::Arg buf      = b.varying<int16_t>(),
842                   img      = b.uniform(),
843                   uniforms = b.uniform();
844 
845         skvm::I32 x = b.load16(buf);
846 
847         x = b.add(x, b.uniform32(uniforms, 0));
848         x = b.mul(x, b.uniform8 (uniforms, 4));
849         x = b.sub(x, b.uniform16(uniforms, 6));
850 
851         skvm::I32 limit = b.uniform32(uniforms, 8);
852         x = b.select(b.lt(x, b.splat(0)), b.splat(0), x);
853         x = b.select(b.gt(x, limit     ), limit     , x);
854 
855         x = b.gather8(img, x);
856 
857         b.store16(buf, x);
858     }
859 
860     if ((false)) {
861         SkDynamicMemoryWStream buf;
862         dump(b, &buf);
863         sk_sp<SkData> blob = buf.detachAsData();
864         SkDebugf("%.*s\n", blob->size(), blob->data());
865     }
866 
867     test_jit_and_interpreter(b.done(), [&](const skvm::Program& program) {
868         const int N = 31;
869         int16_t buf[N];
870         for (int i = 0; i < N; i++) {
871             buf[i] = i;
872         }
873 
874         const int M = 16;
875         uint8_t img[M];
876         for (int i = 0; i < M; i++) {
877             img[i] = i*i;
878         }
879 
880         struct {
881             int      add   = 5;
882             uint8_t  mul   = 3;
883             uint16_t sub   = 18;
884             int      limit = M-1;
885         } uniforms;
886 
887         program.eval(N, buf, img, &uniforms);
888 
889         for (int i = 0; i < N; i++) {
890             // Our first math calculates x = (i+5)*3 - 18 a.k.a 3*(i-1).
891             int x = 3*(i-1);
892 
893             // Then that's pinned to the limits of img.
894             if (i < 2) { x =  0; }  // Notice i == 1 hits x == 0 exactly...
895             if (i > 5) { x = 15; }  // ...and i == 6 hits x == 15 exactly
896             REPORTER_ASSERT(r, buf[i] == img[x]);
897         }
898     });
899 }
900 
901 
902 template <typename Fn>
test_asm(skiatest::Reporter * r,Fn && fn,std::initializer_list<uint8_t> expected)903 static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
904     uint8_t buf[4096];
905     skvm::Assembler a{buf};
906     fn(a);
907 
908     REPORTER_ASSERT(r, a.size() == expected.size());
909 
910     auto got = (const uint8_t*)buf,
911          want = expected.begin();
912     for (int i = 0; i < (int)std::min(a.size(), expected.size()); i++) {
913         REPORTER_ASSERT(r, got[i] == want[i],
914                         "byte %d was %02x, want %02x", i, got[i], want[i]);
915     }
916 }
917 
// Byte-for-byte checks of skvm::Assembler's instruction encodings.  Each
// test_asm() call emits a handful of instructions through the lambda and
// compares the resulting buffer against the hand-listed expected bytes.
DEF_TEST(SkVM_Assembler, r) {
    // Easiest way to generate test cases is
    //
    //   echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
    //
    // The -x86-asm-syntax=intel bit is optional, controlling the
    // input syntax only; the output will always be AT&T  op x,y,dst style.
    // Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
    // that a bit easier to use here, despite maybe favoring AT&T overall.

    using A = skvm::Assembler;
    // Our exit strategy from AVX code.
    test_asm(r, [&](A& a) {
        a.vzeroupper();
        a.ret();
    },{
        0xc5, 0xf8, 0x77,
        0xc3,
    });

    // Align should pad with zero
    test_asm(r, [&](A& a) {
        a.ret();
        a.align(4);
    },{
        0xc3,
        0x00, 0x00, 0x00,
    });

    // add/sub of an immediate: the expected bytes use opcode 0x83 with a
    // 1-byte immediate and opcode 0x81 with a 4-byte immediate.
    test_asm(r, [&](A& a) {
        a.add(A::rax, 8);       // Always good to test rax.
        a.sub(A::rax, 32);

        a.add(A::rdi, 12);      // Last 0x48 REX
        a.sub(A::rdi, 8);

        a.add(A::r8 , 7);       // First 0x49 REX
        a.sub(A::r8 , 4);

        a.add(A::rsi, 128);     // Requires 4 byte immediate.
        a.sub(A::r8 , 1000000);
    },{
        0x48, 0x83, 0b11'000'000, 0x08,
        0x48, 0x83, 0b11'101'000, 0x20,

        0x48, 0x83, 0b11'000'111, 0x0c,
        0x48, 0x83, 0b11'101'111, 0x08,

        0x49, 0x83, 0b11'000'000, 0x07,
        0x49, 0x83, 0b11'101'000, 0x04,

        0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
        0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
    });


    // Three-operand AVX integer math, covering when the 2-byte (0xc5) and
    // 3-byte (0xc4) VEX prefixes are chosen.
    test_asm(r, [&](A& a) {
        a.vpaddd (A::ymm0, A::ymm1, A::ymm2);  // Low registers and 0x0f map     -> 2-byte VEX.
        a.vpaddd (A::ymm8, A::ymm1, A::ymm2);  // A high dst register is ok      -> 2-byte VEX.
        a.vpaddd (A::ymm0, A::ymm8, A::ymm2);  // A high first argument register -> 2-byte VEX.
        a.vpaddd (A::ymm0, A::ymm1, A::ymm8);  // A high second argument         -> 3-byte VEX.
        a.vpmulld(A::ymm0, A::ymm1, A::ymm2);  // Using non-0x0f map instruction -> 3-byte VEX.
        a.vpsubd (A::ymm0, A::ymm1, A::ymm2);  // Test vpsubd to ensure argument order is right.
    },{
        /*    VEX     */ /*op*/ /*modRM*/
        0xc5,       0xf5, 0xfe, 0xc2,
        0xc5,       0x75, 0xfe, 0xc2,
        0xc5,       0xbd, 0xfe, 0xc2,
        0xc4, 0xc1, 0x75, 0xfe, 0xc0,
        0xc4, 0xe2, 0x75, 0x40, 0xc2,
        0xc5,       0xf5, 0xfa, 0xc2,
    });

    // Integer compares; these differ from each other only in the opcode byte.
    test_asm(r, [&](A& a) {
        a.vpcmpeqd(A::ymm0, A::ymm1, A::ymm2);
        a.vpcmpgtd(A::ymm0, A::ymm1, A::ymm2);
    },{
        0xc5,0xf5,0x76,0xc2,
        0xc5,0xf5,0x66,0xc2,
    });

    // Four-register form: the last register (ymm3) shows up in the trailing
    // byte, 0x30.
    test_asm(r, [&](A& a) {
        a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
    },{
        0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
    });

    // Shifts by an immediate, which is emitted as the final byte.
    test_asm(r, [&](A& a) {
        a.vpsrld(A::ymm15, A::ymm2, 8);
        a.vpsrld(A::ymm0 , A::ymm8, 5);
    },{
        0xc5,     0x85, 0x72,0xd2, 0x08,
        0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
    });

    test_asm(r, [&](A& a) {
        a.vpermq(A::ymm1, A::ymm2, 5);
    },{
        0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
    });

    // Label operands: four data bytes are emitted at the label, and each
    // following instruction refers back to them with a negative 32-bit
    // offset that grows as more code is emitted.
    test_asm(r, [&](A& a) {
        A::Label l = a.here();
        a.byte(1);
        a.byte(2);
        a.byte(3);
        a.byte(4);

        a.vbroadcastss(A::ymm0 , &l);
        a.vbroadcastss(A::ymm1 , &l);
        a.vbroadcastss(A::ymm8 , &l);
        a.vbroadcastss(A::ymm15, &l);

        a.vpshufb(A::ymm4, A::ymm3, &l);
    },{
        0x01, 0x02, 0x03, 0x4,

        /*     VEX    */  /*op*/ /*   ModRM    */  /*     offset     */
        0xc4, 0xe2, 0x7d,  0x18,   0b00'000'101,   0xf3,0xff,0xff,0xff,   // 0xfffffff3 == -13
        0xc4, 0xe2, 0x7d,  0x18,   0b00'001'101,   0xea,0xff,0xff,0xff,   // 0xffffffea == -22
        0xc4, 0x62, 0x7d,  0x18,   0b00'000'101,   0xe1,0xff,0xff,0xff,   // 0xffffffe1 == -31
        0xc4, 0x62, 0x7d,  0x18,   0b00'111'101,   0xd8,0xff,0xff,0xff,   // 0xffffffd8 == -40

        0xc4, 0xe2, 0x65,  0x00,   0b00'100'101,   0xcf,0xff,0xff,0xff,   // 0xffffffcf == -49
    });

    // vbroadcastss from register+offset (no offset byte for 0, one byte for
    // small offsets, four bytes for 400) and from an xmm register.
    test_asm(r, [&](A& a) {
        a.vbroadcastss(A::ymm0,  A::rdi,   0);
        a.vbroadcastss(A::ymm13, A::r14,   7);
        a.vbroadcastss(A::ymm8,  A::rdx, -12);
        a.vbroadcastss(A::ymm8,  A::rdx, 400);

        a.vbroadcastss(A::ymm8,  A::xmm0);
        a.vbroadcastss(A::ymm0,  A::xmm13);
    },{
        /*   VEX    */ /*op*/     /*ModRM*/   /*offset*/
        0xc4,0xe2,0x7d, 0x18,   0b00'000'111,
        0xc4,0x42,0x7d, 0x18,   0b01'101'110,  0x07,
        0xc4,0x62,0x7d, 0x18,   0b01'000'010,  0xf4,
        0xc4,0x62,0x7d, 0x18,   0b10'000'010,  0x90,0x01,0x00,0x00,

        0xc4,0x62,0x7d, 0x18,   0b11'000'000,
        0xc4,0xc2,0x7d, 0x18,   0b11'000'101,
    });

    // Jumps back to a label at the start of the buffer, then cmp against
    // immediates (1-byte form 0x83, 4-byte form 0x81).
    test_asm(r, [&](A& a) {
        A::Label l = a.here();
        a.jne(&l);
        a.jne(&l);
        a.je (&l);
        a.jmp(&l);
        a.jl (&l);

        a.cmp(A::rdx, 0);
        a.cmp(A::rax, 12);
        a.cmp(A::r14, 2000000000);
    },{
        0x0f,0x85, 0xfa,0xff,0xff,0xff,   // near jne -6 bytes
        0x0f,0x85, 0xf4,0xff,0xff,0xff,   // near jne -12 bytes
        0x0f,0x84, 0xee,0xff,0xff,0xff,   // near je  -18 bytes
        0xe9,      0xe9,0xff,0xff,0xff,   // near jmp -23 bytes
        0x0f,0x8c, 0xe3,0xff,0xff,0xff,   // near jl  -29 bytes

        0x48,0x83,0xfa,0x00,
        0x48,0x83,0xf8,0x0c,
        0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
    });

    // Vector moves whose other operand is a general-purpose register; all
    // of the expected ModRM bytes below have mod bits 00.
    test_asm(r, [&](A& a) {
        a.vmovups(A::ymm5, A::rsi);
        a.vmovups(A::rsi, A::ymm5);

        a.vmovups(A::rsi, A::xmm5);

        a.vpmovzxwd(A::ymm4, A::rsi);
        a.vpmovzxbd(A::ymm4, A::rsi);

        a.vmovq(A::rdx, A::xmm15);
    },{
        /*    VEX    */  /*Op*/  /*  ModRM  */
        0xc5,     0xfc,   0x10,  0b00'101'110,
        0xc5,     0xfc,   0x11,  0b00'101'110,

        0xc5,     0xf8,   0x11,  0b00'101'110,

        0xc4,0xe2,0x7d,   0x33,  0b00'100'110,
        0xc4,0xe2,0x7d,   0x31,  0b00'100'110,

        0xc5,     0x79,   0xd6,  0b00'111'010,
    });

    // Scalar moves.  The plain vmovd forms end in a ModRM byte of 0x00,
    // while the vmovd_direct forms end in 0xc0 (mod bits 11).
    test_asm(r, [&](A& a) {
        a.movzbl(A::rax, A::rsi, 0);   // Low registers for src and dst.
        a.movzbl(A::rax, A::r8,  0);   // High src register.
        a.movzbl(A::r8 , A::rsi, 0);   // High dst register.
        a.movzbl(A::r8,  A::rsi, 12);
        a.movzbl(A::r8,  A::rsi, 400);

        a.vmovd(A::rax, A::xmm0);
        a.vmovd(A::rax, A::xmm8);
        a.vmovd(A::r8,  A::xmm0);

        a.vmovd(A::xmm0, A::rax);
        a.vmovd(A::xmm8, A::rax);
        a.vmovd(A::xmm0, A::r8);

        a.vmovd_direct(A::rax, A::xmm0);
        a.vmovd_direct(A::rax, A::xmm8);
        a.vmovd_direct(A::r8,  A::xmm0);

        a.vmovd_direct(A::xmm0, A::rax);
        a.vmovd_direct(A::xmm8, A::rax);
        a.vmovd_direct(A::xmm0, A::r8);

        a.movb(A::rdx, A::rax);
        a.movb(A::rdx, A::r8);
        a.movb(A::r8 , A::rax);
    },{
        0x0f,0xb6,0x06,
        0x41,0x0f,0xb6,0x00,
        0x44,0x0f,0xb6,0x06,
        0x44,0x0f,0xb6,0x46, 12,
        0x44,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,

        0xc5,0xf9,0x7e,0x00,
        0xc5,0x79,0x7e,0x00,
        0xc4,0xc1,0x79,0x7e,0x00,

        0xc5,0xf9,0x6e,0x00,
        0xc5,0x79,0x6e,0x00,
        0xc4,0xc1,0x79,0x6e,0x00,

        0xc5,0xf9,0x7e,0xc0,
        0xc5,0x79,0x7e,0xc0,
        0xc4,0xc1,0x79,0x7e,0xc0,

        0xc5,0xf9,0x6e,0xc0,
        0xc5,0x79,0x6e,0xc0,
        0xc4,0xc1,0x79,0x6e,0xc0,

        0x88, 0x02,
        0x44, 0x88, 0x02,
        0x41, 0x88, 0x00,
    });

    // Lane insert/extract; the lane index is emitted as the final byte.
    test_asm(r, [&](A& a) {
        a.vpinsrw(A::xmm1, A::xmm8, A::rsi, 4);
        a.vpinsrw(A::xmm8, A::xmm1, A::r8, 12);

        a.vpinsrb(A::xmm1, A::xmm8, A::rsi, 4);
        a.vpinsrb(A::xmm8, A::xmm1, A::r8, 12);

        a.vpextrw(A::rsi, A::xmm8, 7);
        a.vpextrw(A::r8,  A::xmm1, 15);

        a.vpextrb(A::rsi, A::xmm8, 7);
        a.vpextrb(A::r8,  A::xmm1, 15);
    },{
        0xc5,0xb9,      0xc4, 0x0e,  4,
        0xc4,0x41,0x71, 0xc4, 0x00, 12,

        0xc4,0xe3,0x39, 0x20, 0x0e,  4,
        0xc4,0x43,0x71, 0x20, 0x00, 12,

        0xc4,0x63,0x79, 0x15, 0x06,  7,
        0xc4,0xc3,0x79, 0x15, 0x08, 15,

        0xc4,0x63,0x79, 0x14, 0x06,  7,
        0xc4,0xc3,0x79, 0x14, 0x08, 15,
    });

    test_asm(r, [&](A& a) {
        a.vpandn(A::ymm3, A::ymm12, A::ymm2);
    },{
        0xc5, 0x9d, 0xdf, 0xda,
    });

    // Two-operand forms: a register move and the float<->int conversions.
    test_asm(r, [&](A& a) {
        a.vmovdqa   (A::ymm3, A::ymm2);
        a.vcvttps2dq(A::ymm3, A::ymm2);
        a.vcvtdq2ps (A::ymm3, A::ymm2);
    },{
        0xc5,0xfd,0x6f,0xda,
        0xc5,0xfe,0x5b,0xda,
        0xc5,0xfc,0x5b,0xda,
    });

    // The remaining cases cover the ARM64 side of the assembler, where every
    // expected instruction is 4 bytes.  Encodings can be generated with e.g.
    // echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64

    test_asm(r, [&](A& a) {
        a.and16b(A::v4, A::v3, A::v1);
        a.orr16b(A::v4, A::v3, A::v1);
        a.eor16b(A::v4, A::v3, A::v1);
        a.bic16b(A::v4, A::v3, A::v1);

        a.add4s(A::v4, A::v3, A::v1);
        a.sub4s(A::v4, A::v3, A::v1);
        a.mul4s(A::v4, A::v3, A::v1);

        a.sub8h(A::v4, A::v3, A::v1);
        a.mul8h(A::v4, A::v3, A::v1);

        a.fadd4s(A::v4, A::v3, A::v1);
        a.fsub4s(A::v4, A::v3, A::v1);
        a.fmul4s(A::v4, A::v3, A::v1);
        a.fdiv4s(A::v4, A::v3, A::v1);

        a.fmla4s(A::v4, A::v3, A::v1);
    },{
        0x64,0x1c,0x21,0x4e,
        0x64,0x1c,0xa1,0x4e,
        0x64,0x1c,0x21,0x6e,
        0x64,0x1c,0x61,0x4e,

        0x64,0x84,0xa1,0x4e,
        0x64,0x84,0xa1,0x6e,
        0x64,0x9c,0xa1,0x4e,

        0x64,0x84,0x61,0x6e,
        0x64,0x9c,0x61,0x4e,

        0x64,0xd4,0x21,0x4e,
        0x64,0xd4,0xa1,0x4e,
        0x64,0xdc,0x21,0x6e,
        0x64,0xfc,0x21,0x6e,

        0x64,0xcc,0x21,0x4e,
    });

    // Shifts by an immediate.  In the expected bytes the left shifts carry
    // the shift amount directly in the third byte (0x20 + n), while the
    // right shifts carry 0x40 - n (4s) or 0x20 - n (8h).
    test_asm(r, [&](A& a) {
        a.shl4s(A::v4, A::v3,  0);
        a.shl4s(A::v4, A::v3,  1);
        a.shl4s(A::v4, A::v3,  8);
        a.shl4s(A::v4, A::v3, 16);
        a.shl4s(A::v4, A::v3, 31);

        a.sshr4s(A::v4, A::v3,  1);
        a.sshr4s(A::v4, A::v3,  8);
        a.sshr4s(A::v4, A::v3, 31);

        a.ushr4s(A::v4, A::v3,  1);
        a.ushr4s(A::v4, A::v3,  8);
        a.ushr4s(A::v4, A::v3, 31);

        a.ushr8h(A::v4, A::v3,  1);
        a.ushr8h(A::v4, A::v3,  8);
        a.ushr8h(A::v4, A::v3, 15);
    },{
        0x64,0x54,0x20,0x4f,
        0x64,0x54,0x21,0x4f,
        0x64,0x54,0x28,0x4f,
        0x64,0x54,0x30,0x4f,
        0x64,0x54,0x3f,0x4f,

        0x64,0x04,0x3f,0x4f,
        0x64,0x04,0x38,0x4f,
        0x64,0x04,0x21,0x4f,

        0x64,0x04,0x3f,0x6f,
        0x64,0x04,0x38,0x6f,
        0x64,0x04,0x21,0x6f,

        0x64,0x04,0x1f,0x6f,
        0x64,0x04,0x18,0x6f,
        0x64,0x04,0x11,0x6f,
    });

    // sli4s: the expected bytes match the shl4s cases above except for the
    // final byte (0x6f rather than 0x4f).
    test_asm(r, [&](A& a) {
        a.sli4s(A::v4, A::v3,  0);
        a.sli4s(A::v4, A::v3,  1);
        a.sli4s(A::v4, A::v3,  8);
        a.sli4s(A::v4, A::v3, 16);
        a.sli4s(A::v4, A::v3, 31);
    },{
        0x64,0x54,0x20,0x6f,
        0x64,0x54,0x21,0x6f,
        0x64,0x54,0x28,0x6f,
        0x64,0x54,0x30,0x6f,
        0x64,0x54,0x3f,0x6f,
    });

    test_asm(r, [&](A& a) {
        a.scvtf4s (A::v4, A::v3);
        a.fcvtzs4s(A::v4, A::v3);
    },{
        0x64,0xd8,0x21,0x4e,
        0x64,0xb8,0xa1,0x4e,
    });

    // Returns, add/sub/subs/cmp with immediates, and branches back to a
    // label.  Every branch targets the same label, so each successive one
    // encodes an offset 4 bytes further back.
    test_asm(r, [&](A& a) {
        a.ret(A::x30);   // Conventional ret using link register.
        a.ret(A::x13);   // Can really return using any register if we like.

        a.add(A::x2, A::x2,  4);
        a.add(A::x3, A::x2, 32);

        a.sub(A::x2, A::x2, 4);
        a.sub(A::x3, A::x2, 32);

        a.subs(A::x2, A::x2,  4);
        a.subs(A::x3, A::x2, 32);

        a.subs(A::xzr, A::x2, 4);  // These are actually the same instruction!
        a.cmp(A::x2, 4);

        A::Label l = a.here();
        a.bne(&l);
        a.bne(&l);
        a.blt(&l);
        a.b(&l);
        a.cbnz(A::x2, &l);
        a.cbz(A::x2, &l);
    },{
        0xc0,0x03,0x5f,0xd6,
        0xa0,0x01,0x5f,0xd6,

        0x42,0x10,0x00,0x91,
        0x43,0x80,0x00,0x91,

        0x42,0x10,0x00,0xd1,
        0x43,0x80,0x00,0xd1,

        0x42,0x10,0x00,0xf1,
        0x43,0x80,0x00,0xf1,

        0x5f,0x10,0x00,0xf1,
        0x5f,0x10,0x00,0xf1,

        0x01,0x00,0x00,0x54,   // b.ne #0
        0xe1,0xff,0xff,0x54,   // b.ne #-4
        0xcb,0xff,0xff,0x54,   // b.lt #-8
        0xae,0xff,0xff,0x54,   // b.al #-12
        0x82,0xff,0xff,0xb5,   // cbnz x2, #-16
        0x62,0xff,0xff,0xb4,   // cbz x2, #-20
    });

    // Can we cbz() to a not-yet-defined label?
    test_asm(r, [&](A& a) {
        A::Label l;
        a.cbz(A::x2, &l);
        a.add(A::x3, A::x2, 32);
        a.label(&l);
        a.ret(A::x30);
    },{
        0x42,0x00,0x00,0xb4,  // cbz x2, #8
        0x43,0x80,0x00,0x91,  // add x3, x2, #32
        0xc0,0x03,0x5f,0xd6,  // ret
    });

    // If we start a label as a backward label,
    // can we redefine it to be a future label?
    // (Not sure this is useful... just want to test it works.)
    test_asm(r, [&](A& a) {
        A::Label l1 = a.here();
        a.add(A::x3, A::x2, 32);
        a.cbz(A::x2, &l1);          // This will jump backward... nothing sneaky.

        A::Label l2 = a.here();     // Start off the same...
        a.add(A::x3, A::x2, 32);
        a.cbz(A::x2, &l2);          // Looks like this will go backward...
        a.add(A::x2, A::x2, 4);
        a.add(A::x3, A::x2, 32);
        a.label(&l2);               // But no... actually forward!  What a switcheroo!
    },{
        0x43,0x80,0x00,0x91,  // add x3, x2, #32
        0xe2,0xff,0xff,0xb4,  // cbz x2, #-4

        0x43,0x80,0x00,0x91,  // add x3, x2, #32
        0x62,0x00,0x00,0xb4,  // cbz x2, #12
        0x42,0x10,0x00,0x91,  // add x2, x2, #4
        0x43,0x80,0x00,0x91,  // add x3, x2, #32
    });

    // Loading from a label on ARM.
    // word() data appears in the expected bytes least-significant byte first.
    test_asm(r, [&](A& a) {
        A::Label fore,aft;
        a.label(&fore);
        a.word(0x01234567);
        a.ldrq(A::v1, &fore);
        a.ldrq(A::v2, &aft);
        a.label(&aft);
        a.word(0x76543210);
    },{
        0x67,0x45,0x23,0x01,
        0xe1,0xff,0xff,0x9c,  // ldr q1, #-4
        0x22,0x00,0x00,0x9c,  // ldr q2, #4
        0x10,0x32,0x54,0x76,
    });

    // ldrq/strq through a pointer register.
    test_asm(r, [&](A& a) {
        a.ldrq(A::v0, A::x8);
        a.strq(A::v0, A::x8);
    },{
        0x00,0x01,0xc0,0x3d,
        0x00,0x01,0x80,0x3d,
    });

    // xtn*/uxtl* lane-width conversions alongside the strs/ldrs store and load.
    test_asm(r, [&](A& a) {
        a.xtns2h(A::v0, A::v0);
        a.xtnh2b(A::v0, A::v0);
        a.strs  (A::v0, A::x0);

        a.ldrs   (A::v0, A::x0);
        a.uxtlb2h(A::v0, A::v0);
        a.uxtlh2s(A::v0, A::v0);
    },{
        0x00,0x28,0x61,0x0e,
        0x00,0x28,0x21,0x0e,
        0x00,0x00,0x00,0xbd,

        0x00,0x00,0x40,0xbd,
        0x00,0xa4,0x08,0x2f,
        0x00,0xa4,0x10,0x2f,
    });

    // ldrb/strb through a pointer register.
    test_asm(r, [&](A& a) {
        a.ldrb(A::v0, A::x8);
        a.strb(A::v0, A::x8);
    },{
        0x00,0x01,0x40,0x3d,
        0x00,0x01,0x00,0x3d,
    });

    test_asm(r, [&](A& a) {
        a.tbl(A::v0, A::v1, A::v2);
    },{
        0x20,0x00,0x02,0x4e,
    });
}
1447