• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkColorPriv.h"
9 #include "include/private/SkColorData.h"
10 #include "src/core/SkMSAN.h"
11 #include "src/core/SkVM.h"
12 #include "tests/Test.h"
13 #include "tools/Resources.h"
14 #include "tools/SkVMBuilders.h"
15 
16 using Fmt = SrcoverBuilder_F32::Fmt;
fmt_name(Fmt fmt)17 const char* fmt_name(Fmt fmt) {
18     switch (fmt) {
19         case Fmt::A8:        return "A8";
20         case Fmt::G8:        return "G8";
21         case Fmt::RGBA_8888: return "RGBA_8888";
22     }
23     return "";
24 }
25 
dump(skvm::Builder & builder,SkWStream * o)26 static void dump(skvm::Builder& builder, SkWStream* o) {
27     skvm::Program program = builder.done();
28     builder.dump(o);
29     o->writeText("\n");
30     program.dump(o);
31     o->writeText("\n");
32 }
33 
34 // TODO: I'd like this to go away and have every test in here run both JIT and interpreter.
35 template <typename Fn>
test_interpreter_only(skiatest::Reporter * r,skvm::Program && program,Fn && test)36 static void test_interpreter_only(skiatest::Reporter* r, skvm::Program&& program, Fn&& test) {
37     REPORTER_ASSERT(r, !program.hasJIT());
38     test((const skvm::Program&) program);
39 }
40 
41 template <typename Fn>
test_jit_and_interpreter(skiatest::Reporter * r,skvm::Program && program,Fn && test)42 static void test_jit_and_interpreter(skiatest::Reporter* r, skvm::Program&& program, Fn&& test) {
43     static const bool can_jit = []{
44         // This is about the simplest program we can write, setting an int buffer to a constant.
45         // If this can't JIT, the platform does not support JITing.
46         skvm::Builder b;
47         b.store32(b.varying<int>(), b.splat(42));
48         skvm::Program p = b.done();
49         return p.hasJIT();
50     }();
51 
52     if (can_jit) {
53         REPORTER_ASSERT(r, program.hasJIT());
54         test((const skvm::Program&) program);
55         program.dropJIT();
56     }
57     test_interpreter_only(r, std::move(program), std::move(test));
58 }
59 
60 
DEF_TEST(SkVM,r)61 DEF_TEST(SkVM, r) {
62     SkDynamicMemoryWStream buf;
63 
64     // Write all combinations of SrcoverBuilder_F32
65     for (int s = 0; s < 3; s++)
66     for (int d = 0; d < 3; d++) {
67         auto srcFmt = (Fmt)s,
68              dstFmt = (Fmt)d;
69         SrcoverBuilder_F32 builder{srcFmt, dstFmt};
70 
71         buf.writeText(fmt_name(srcFmt));
72         buf.writeText(" over ");
73         buf.writeText(fmt_name(dstFmt));
74         buf.writeText("\n");
75         dump(builder, &buf);
76     }
77 
78     // Write the I32 Srcovers also.
79     {
80         SrcoverBuilder_I32_Naive builder;
81         buf.writeText("I32 (Naive) 8888 over 8888\n");
82         dump(builder, &buf);
83     }
84     {
85         SrcoverBuilder_I32 builder;
86         buf.writeText("I32 8888 over 8888\n");
87         dump(builder, &buf);
88     }
89     {
90         SrcoverBuilder_I32_SWAR builder;
91         buf.writeText("I32 (SWAR) 8888 over 8888\n");
92         dump(builder, &buf);
93     }
94 
95     {
96         skvm::Builder b;
97         skvm::Arg arg = b.varying<int>();
98 
99         // x and y can both be hoisted,
100         // and x can die at y, while y must live for the loop.
101         skvm::I32 x = b.splat(1),
102                   y = b.add(x, b.splat(2));
103         b.store32(arg, b.mul(b.load32(arg), y));
104 
105         skvm::Program program = b.done();
106         REPORTER_ASSERT(r, program.nregs() == 2);
107 
108         std::vector<skvm::OptimizedInstruction> insts = b.optimize();
109         REPORTER_ASSERT(r, insts.size() == 6);
110         REPORTER_ASSERT(r,  insts[0].can_hoist && insts[0].death == 2 && !insts[0].used_in_loop);
111         REPORTER_ASSERT(r,  insts[1].can_hoist && insts[1].death == 2 && !insts[1].used_in_loop);
112         REPORTER_ASSERT(r,  insts[2].can_hoist && insts[2].death == 4 &&  insts[2].used_in_loop);
113         REPORTER_ASSERT(r, !insts[3].can_hoist);
114         REPORTER_ASSERT(r, !insts[4].can_hoist);
115         REPORTER_ASSERT(r, !insts[5].can_hoist);
116 
117         dump(b, &buf);
118 
119         test_jit_and_interpreter(r, std::move(program), [&](const skvm::Program& program) {
120             int arg[] = {0,1,2,3,4,5,6,7,8,9};
121 
122             program.eval(SK_ARRAY_COUNT(arg), arg);
123 
124             for (int i = 0; i < (int)SK_ARRAY_COUNT(arg); i++) {
125                 REPORTER_ASSERT(r, arg[i] == i*3);
126             }
127         });
128     }
129 
130     {
131         // Demonstrate the value of program reordering.
132         skvm::Builder b;
133         skvm::Arg sp = b.varying<int>(),
134                   dp = b.varying<int>();
135 
136         skvm::I32 byte = b.splat(0xff);
137 
138         skvm::I32 src = b.load32(sp),
139                   sr  = b.extract(src,  0, byte),
140                   sg  = b.extract(src,  8, byte),
141                   sb  = b.extract(src, 16, byte),
142                   sa  = b.extract(src, 24, byte);
143 
144         skvm::I32 dst = b.load32(dp),
145                   dr  = b.extract(dst,  0, byte),
146                   dg  = b.extract(dst,  8, byte),
147                   db  = b.extract(dst, 16, byte),
148                   da  = b.extract(dst, 24, byte);
149 
150         skvm::I32 R = b.add(sr, dr),
151                   G = b.add(sg, dg),
152                   B = b.add(sb, db),
153                   A = b.add(sa, da);
154 
155         skvm::I32 rg = b.pack(R, G, 8),
156                   ba = b.pack(B, A, 8),
157                   rgba = b.pack(rg, ba, 16);
158 
159         b.store32(dp, rgba);
160 
161         dump(b, &buf);
162     }
163 
164     sk_sp<SkData> blob = buf.detachAsData();
165     {
166 
167         sk_sp<SkData> expected = GetResourceAsData("SkVMTest.expected");
168         REPORTER_ASSERT(r, expected, "Couldn't load SkVMTest.expected.");
169         if (expected) {
170             if (blob->size() != expected->size()
171                     || 0 != memcmp(blob->data(), expected->data(), blob->size())) {
172 
173                 ERRORF(r, "SkVMTest expected\n%.*s\nbut got\n%.*s\n",
174                        expected->size(), expected->data(),
175                        blob->size(), blob->data());
176             }
177 
178             SkFILEWStream out(GetResourcePath("SkVMTest.expected").c_str());
179             if (out.isValid()) {
180                 out.write(blob->data(), blob->size());
181             }
182         }
183     }
184 
185     auto test_8888 = [&](skvm::Program&& program) {
186         uint32_t src[9];
187         uint32_t dst[SK_ARRAY_COUNT(src)];
188 
189         test_jit_and_interpreter(r, std::move(program), [&](const skvm::Program& program) {
190             for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
191                 src[i] = 0xbb007733;
192                 dst[i] = 0xffaaccee;
193             }
194 
195             SkPMColor expected = SkPMSrcOver(src[0], dst[0]);  // 0xff2dad73
196 
197             program.eval((int)SK_ARRAY_COUNT(src), src, dst);
198 
199             // dst is probably 0xff2dad72.
200             for (auto got : dst) {
201                 auto want = expected;
202                 for (int i = 0; i < 4; i++) {
203                     uint8_t d = got  & 0xff,
204                             w = want & 0xff;
205                     if (abs(d-w) >= 2) {
206                         SkDebugf("d %02x, w %02x\n", d,w);
207                     }
208                     REPORTER_ASSERT(r, abs(d-w) < 2);
209                     got  >>= 8;
210                     want >>= 8;
211                 }
212             }
213         });
214     };
215 
216     test_8888(SrcoverBuilder_F32{Fmt::RGBA_8888, Fmt::RGBA_8888}.done("srcover_f32"));
217     test_8888(SrcoverBuilder_I32_Naive{}.done("srcover_i32_naive"));
218     test_8888(SrcoverBuilder_I32{}.done("srcover_i32"));
219     test_8888(SrcoverBuilder_I32_SWAR{}.done("srcover_i32_SWAR"));
220 
221     test_jit_and_interpreter(r, SrcoverBuilder_F32{Fmt::RGBA_8888, Fmt::G8}.done(),
222                              [&](const skvm::Program& program) {
223         uint32_t src[9];
224         uint8_t  dst[SK_ARRAY_COUNT(src)];
225 
226         for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
227             src[i] = 0xbb007733;
228             dst[i] = 0x42;
229         }
230 
231         SkPMColor over = SkPMSrcOver(SkPackARGB32(0xbb, 0x33, 0x77, 0x00),
232                                      0xff424242);
233 
234         uint8_t want = SkComputeLuminance(SkGetPackedR32(over),
235                                           SkGetPackedG32(over),
236                                           SkGetPackedB32(over));
237         program.eval((int)SK_ARRAY_COUNT(src), src, dst);
238 
239         for (auto got : dst) {
240             REPORTER_ASSERT(r, abs(got-want) < 3);
241         }
242     });
243 
244     test_jit_and_interpreter(r, SrcoverBuilder_F32{Fmt::A8, Fmt::A8}.done(),
245                              [&](const skvm::Program& program) {
246         uint8_t src[256],
247                 dst[256];
248         for (int i = 0; i < 256; i++) {
249             src[i] = 255 - i;
250             dst[i] = i;
251         }
252 
253         program.eval(256, src, dst);
254 
255         for (int i = 0; i < 256; i++) {
256             uint8_t want = SkGetPackedA32(SkPMSrcOver(SkPackARGB32(src[i], 0,0,0),
257                                                       SkPackARGB32(     i, 0,0,0)));
258             REPORTER_ASSERT(r, abs(dst[i]-want) < 2);
259         }
260     });
261 }
262 
// A program that never touches memory: every instruction should be flagged
// as dead and hoistable, yet evaluating the program must still work.
DEF_TEST(SkVM_Pointless, r) {
    skvm::Builder b;
    {
        // The sum is never stored anywhere.
        b.add(b.splat(5.0f),
              b.splat(4.0f));
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        // "Run" it for every count from 0 through 63.
        int N = 0;
        while (N < 64) {
            program.eval(N);
            N++;
        }
    });

    const auto optimized = b.optimize();
    for (const skvm::OptimizedInstruction& inst : optimized) {
        REPORTER_ASSERT(r, inst.death == 0 && inst.can_hoist == true);
    }
}
282 
// Evaluates a "buf[i] += 1" program for every N from 0 through the buffer
// size and checks that exactly the first N elements were incremented.
DEF_TEST(SkVM_LoopCounts, r) {
    skvm::Builder b;
    skvm::Arg arg = b.varying<int>();
    b.store32(arg,
              b.add(b.splat(1),
                    b.load32(arg)));

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        int buf[64];
        const int kCount = (int)SK_ARRAY_COUNT(buf);

        for (int N = 0; N <= kCount; N++) {
            // Reset to the identity sequence before each run.
            for (int i = 0; i < kCount; i++) {
                buf[i] = i;
            }

            program.eval(N, buf);

            // Elements below N were bumped by one; the rest are untouched.
            for (int i = 0; i < kCount; i++) {
                const int want = (i < N) ? i+1 : i;
                REPORTER_ASSERT(r, buf[i] == want);
            }
        }
    });
}
310 
// Exercises gather32: each buffer element is replaced by the image entry
// selected by its low three bits.
DEF_TEST(SkVM_gather32, r) {
    skvm::Builder b;
    {
        skvm::Arg uniforms = b.uniform(),
                  buf      = b.varying<int>();
        skvm::I32 x = b.load32(buf);
        b.store32(buf, b.gather32(uniforms,0, b.bit_and(x, b.splat(7))));
    }

    // The JIT path is only exercised on x86 builds.
#if defined(SK_CPU_X86)
    test_jit_and_interpreter
#else
    test_interpreter_only
#endif
    (r, b.done(), [&](const skvm::Program& program) {
        constexpr int kImgCount = 8;
        constexpr int kBufCount = 20;

        const int img[kImgCount] = {12,34,56,78, 90,98,76,54};

        int buf[kBufCount];
        for (int i = 0; i < kBufCount; i++) {
            buf[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        program.eval(kBufCount, &uniforms, buf);

        // buf started as 0..19, so the results cycle through img.
        for (int i = 0; i < kBufCount; i++) {
            REPORTER_ASSERT(r, buf[i] == img[i % kImgCount]);
        }
    });
}
363 
DEF_TEST(SkVM_gathers,r)364 DEF_TEST(SkVM_gathers, r) {
365     skvm::Builder b;
366     {
367         skvm::Arg uniforms = b.uniform(),
368                   buf32    = b.varying<int>(),
369                   buf16    = b.varying<uint16_t>(),
370                   buf8     = b.varying<uint8_t>();
371 
372         skvm::I32 x = b.load32(buf32);
373 
374         b.store32(buf32, b.gather32(uniforms,0, b.bit_and(x, b.splat( 7))));
375         b.store16(buf16, b.gather16(uniforms,0, b.bit_and(x, b.splat(15))));
376         b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
377     }
378 
379     test_interpreter_only(r, b.done(), [&](const skvm::Program& program) {
380         const int img[] = {12,34,56,78, 90,98,76,54};
381 
382         constexpr int N = 20;
383         int      buf32[N];
384         uint16_t buf16[N];
385         uint8_t  buf8 [N];
386 
387         for (int i = 0; i < 20; i++) {
388             buf32[i] = i;
389         }
390 
391         struct Uniforms {
392             const int* img;
393         } uniforms{img};
394 
395         program.eval(N, &uniforms, buf32, buf16, buf8);
396         int i = 0;
397         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 12); i++;
398         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
399         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
400         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
401         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 56 && buf8[i] == 34); i++;
402         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
403         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 78 && buf8[i] ==  0); i++;
404         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
405 
406         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 90 && buf8[i] == 56); i++;
407         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
408         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 98 && buf8[i] ==  0); i++;
409         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
410         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 76 && buf8[i] == 78); i++;
411         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
412         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 54 && buf8[i] ==  0); i++;
413         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
414 
415         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 90); i++;
416         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
417         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
418         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
419     });
420 }
421 
// Chains the bitwise and shift ops on a single value; the trailing comments
// track the running value for the input 0x42.
DEF_TEST(SkVM_bitops, r) {
    skvm::Builder b;
    {
        skvm::Arg ptr = b.varying<int>();

        skvm::I32 v = b.load32(ptr);

        v = b.bit_and  (v, b.splat(0xf1));  // 0x40
        v = b.bit_or   (v, b.splat(0x80));  // 0xc0
        v = b.bit_xor  (v, b.splat(0xfe));  // 0x3e
        v = b.bit_clear(v, b.splat(0x30));  // 0x0e

        v = b.shl(v, 28);  // 0xe000'0000
        v = b.sra(v, 28);  // 0xffff'fffe
        v = b.shr(v,  1);  // 0x7fff'ffff

        b.store32(ptr, v);
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        int value = 0x42;
        program.eval(1, &value);

        const int want = 0x7fff'ffff;
        REPORTER_ASSERT(r, value == want);
    });
}
447 
// Float add/sub/div round trip: (2x - x) / x should be exactly 1 for every
// input used here.
DEF_TEST(SkVM_f32, r) {
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<float>();

        skvm::F32 x = b.bit_cast(b.load32(arg)),
                  y = b.add(x,x),   // y = 2x
                  z = b.sub(y,x),   // z = 2x-x = x
                  w = b.div(z,x);   // w = x/x = 1
        b.store32(arg, b.bit_cast(w));
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        float buf[] = { 1,2,3,4,5,6,7,8,9 };
        program.eval(SK_ARRAY_COUNT(buf), buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            const float got = buf[i];
            REPORTER_ASSERT(r, got == 1.0f);
        }
    });
}
468 
// Packs the results of the six integer comparisons into bits 0..5 of the
// output, comparing the input against 0..5 respectively.
DEF_TEST(SkVM_cmp_i32, r) {
    skvm::Builder b;
    {
        skvm::I32 x = b.load32(b.varying<int>());

        // Reduce a comparison mask to a single bit at position `shift`.
        auto as_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, as_bit(0, b. eq(x, b.splat(0))));
        bits = b.bit_or(bits, as_bit(1, b.neq(x, b.splat(1))));
        bits = b.bit_or(bits, as_bit(2, b. lt(x, b.splat(2))));
        bits = b.bit_or(bits, as_bit(3, b.lte(x, b.splat(3))));
        bits = b.bit_or(bits, as_bit(4, b. gt(x, b.splat(4))));
        bits = b.bit_or(bits, as_bit(5, b.gte(x, b.splat(5))));

        b.store32(b.varying<int>(), bits);
    }

    test_interpreter_only(r, b.done(), [&](const skvm::Program& program) {
        int in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0..4 each have a distinct pattern; 5 and up all share one.
        const int first_five[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            const int want = (i < 5) ? first_five[i] : 0b110010;
            REPORTER_ASSERT(r, out[i] == want);
        }
    });
}
505 
// Float counterpart of the integer comparison test: packs the six float
// comparisons (against 0.0f..5.0f) into bits 0..5 of an int output.
DEF_TEST(SkVM_cmp_f32, r) {
    skvm::Builder b;
    {
        skvm::F32 x = b.bit_cast(b.load32(b.varying<float>()));

        // Reduce a comparison mask to a single bit at position `shift`.
        auto as_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, as_bit(0, b. eq(x, b.splat(0.0f))));
        bits = b.bit_or(bits, as_bit(1, b.neq(x, b.splat(1.0f))));
        bits = b.bit_or(bits, as_bit(2, b. lt(x, b.splat(2.0f))));
        bits = b.bit_or(bits, as_bit(3, b.lte(x, b.splat(3.0f))));
        bits = b.bit_or(bits, as_bit(4, b. gt(x, b.splat(4.0f))));
        bits = b.bit_or(bits, as_bit(5, b.gte(x, b.splat(5.0f))));

        b.store32(b.varying<int>(), bits);
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        float in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0..4 each have a distinct pattern; 5 and up all share one.
        const int first_five[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            const int want = (i < 5) ? first_five[i] : 0b110010;
            REPORTER_ASSERT(r, out[i] == want);
        }
    });
}
542 
// Exercises the 16x2 arithmetic and shift ops, treating each 32-bit load as
// a pair of 16-bit values.
DEF_TEST(SkVM_i16x2, r) {
    skvm::Builder b;
    {
        skvm::Arg buf = b.varying<int>();

        skvm::I32 x = b.load32(buf),
                  y = b.add_16x2(x,x),   // y = 2x
                  z = b.mul_16x2(x,y),   // z = 2x^2
                  w = b.sub_16x2(z,x),   // w = x(2x-1)
                  v = b.shl_16x2(w,7),   // These shifts will be a no-op
                  u = b.sra_16x2(v,7);   // for all but x=12 and x=13.
        b.store32(buf, u);
    }

    test_interpreter_only(r, b.done(), [&](const skvm::Program& program) {
        uint16_t buf[] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13 };

        // Two 16-bit values are processed per 32-bit element.
        program.eval(SK_ARRAY_COUNT(buf)/2, buf);

        // For 0..11 the result is simply x(2x-1).
        int i = 0;
        while (i < 12) {
            const int want = i*(2*i-1);
            REPORTER_ASSERT(r, buf[i] == want);
            i++;
        }

        // For 12 and 13 the shift pair changes the upper byte.
        REPORTER_ASSERT(r, buf[12] == 0xff14);   // 12*23 = 0x114
        REPORTER_ASSERT(r, buf[13] == 0xff45);   // 13*25 = 0x145
    });
}
568 
// 16x2 version of the comparison test: each 16-bit lane gets the six
// comparison results (against 0..5) packed into its bits 0..5.
DEF_TEST(SkVM_cmp_i16, r) {
    skvm::Builder b;
    {
        skvm::Arg buf = b.varying<int>();
        skvm::I32 x = b.load32(buf);

        // Reduce a comparison mask to one bit per 16-bit lane at `shift`.
        auto as_bit = [&](int shift, skvm::I32 mask) {
            return b.shl_16x2(b.bit_and(mask, b.splat(0x0001'0001)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, as_bit(0, b. eq_16x2(x, b.splat(0x0000'0000))));
        bits = b.bit_or(bits, as_bit(1, b.neq_16x2(x, b.splat(0x0001'0001))));
        bits = b.bit_or(bits, as_bit(2, b. lt_16x2(x, b.splat(0x0002'0002))));
        bits = b.bit_or(bits, as_bit(3, b.lte_16x2(x, b.splat(0x0003'0003))));
        bits = b.bit_or(bits, as_bit(4, b. gt_16x2(x, b.splat(0x0004'0004))));
        bits = b.bit_or(bits, as_bit(5, b.gte_16x2(x, b.splat(0x0005'0005))));

        b.store32(buf, bits);
    }

    test_interpreter_only(r, b.done(), [&](const skvm::Program& program) {
        int16_t buf[] = { 0,1, 2,3, 4,5, 6,7, 8,9 };

        // Two 16-bit values are processed per 32-bit element.
        program.eval(SK_ARRAY_COUNT(buf)/2, buf);

        // Inputs 0..4 each have a distinct pattern; 5 and up all share one.
        const int first_five[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            const int want = (i < 5) ? first_five[i] : 0b110010;
            REPORTER_ASSERT(r, buf[i] == want);
        }
    });
}
605 
606 
// Exercises the tricky corners of instruction and register selection for
// Op::mad_f32 with a chain of dependent multiply-adds.
DEF_TEST(SkVM_mad, r) {
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<int>();

        skvm::F32 x = b.to_f32(b.load32(arg)),
                  y = b.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = b.mad(y,y,x),   // y is needed in the future, but r[z] = r[x] is ok.
                  w = b.mad(z,z,y),   // w can alias z but not y.
                  v = b.mad(w,y,w);   // Got to stop somewhere.
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        int value = 2;
        program.eval(1, &value);

        // Starting from x = 2:
        //   y = 2*2 + 2       = 6
        //   z = 6*6 + 2       = 38
        //   w = 38*38 + 6     = 1450
        //   v = 1450*6 + 1450 = 10150
        const int want = 10150;
        REPORTER_ASSERT(r, value == want);
    });
}
634 
// A second mad() chain with a different pattern of register reuse.
DEF_TEST(SkVM_madder, r) {
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<float>();

        skvm::F32 x = b.bit_cast(b.load32(arg)),
                  y = b.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = b.mad(y,x,y),   // r[x] can be reused after this instruction, but not r[y].
                  w = b.mad(y,y,z);
        b.store32(arg, b.bit_cast(w));
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        // Starting from x = 2:
        //   y = 2*2 + 2  = 6
        //   z = 6*2 + 6  = 18
        //   w = 6*6 + 18 = 54
        float value = 2.0f;
        program.eval(1, &value);

        const float want = 54.0f;
        REPORTER_ASSERT(r, value == want);
    });
}
656 
// Applies floor() in place to a buffer of floats and checks each result.
DEF_TEST(SkVM_floor, r) {
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<float>();
        b.store32(arg, b.bit_cast(b.floor(b.bit_cast(b.load32(arg)))));
    }

    // The JIT path is only exercised on x86 builds.
#if defined(SK_CPU_X86)
    test_jit_and_interpreter
#else
    test_interpreter_only
#endif
    (r, b.done(), [&](const skvm::Program& program) {
        float values[]   = { -2.0f, -1.5f, -1.0f, 0.0f, 1.0f, 1.5f, 2.0f };
        float expected[] = { -2.0f, -2.0f, -1.0f, 0.0f, 1.0f, 1.0f, 2.0f };

        program.eval(SK_ARRAY_COUNT(values), values);

        const int count = (int)SK_ARRAY_COUNT(values);
        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, values[i] == expected[i]);
        }
    });
}
678 
// This program uses enough constants that it will fail to JIT if we hoist them.
// The JIT will try again without hoisting, and that'll just need 2 registers.
DEF_TEST(SkVM_hoist, r) {
    constexpr int kConstants = 32;

    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<int>();
        skvm::I32 sum = b.load32(arg);
        for (int i = 0; i < kConstants; i++) {
            sum = b.add(sum, b.splat(i));
        }
        b.store32(arg, sum);
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        int value = 4;
        program.eval(1, &value);

        // value += 0 + 1 + 2 + ... + 30 + 31, i.e. 4 + 496.
        const int want = 500;
        REPORTER_ASSERT(r, value == want);
    });
}
700 
// select(): keeps values greater than 4 and replaces the rest with 42.
DEF_TEST(SkVM_select, r) {
    skvm::Builder b;
    {
        skvm::Arg buf = b.varying<int>();

        skvm::I32 v = b.load32(buf);

        v = b.select( b.gt(v, b.splat(4)), v, b.splat(42) );

        b.store32(buf, v);
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        int buf[] = { 0,1,2,3,4,5,6,7,8 };
        program.eval(SK_ARRAY_COUNT(buf), buf);

        const int count = (int)SK_ARRAY_COUNT(buf);
        for (int i = 0; i < count; i++) {
            int want = 42;
            if (i > 4) {
                want = i;
            }
            REPORTER_ASSERT(r, buf[i] == want);
        }
    });
}
721 
DEF_TEST(SkVM_NewOps,r)722 DEF_TEST(SkVM_NewOps, r) {
723     // Exercise a somewhat arbitrary set of new ops.
724     skvm::Builder b;
725     {
726         skvm::Arg buf      = b.varying<int16_t>(),
727                   uniforms = b.uniform();
728 
729         skvm::I32 x = b.load16(buf);
730 
731         const size_t kPtr = sizeof(const int*);
732 
733         x = b.add(x, b.uniform32(uniforms, kPtr+0));
734         x = b.mul(x, b.uniform8 (uniforms, kPtr+4));
735         x = b.sub(x, b.uniform16(uniforms, kPtr+6));
736 
737         skvm::I32 limit = b.uniform32(uniforms, kPtr+8);
738         x = b.select(b.lt(x, b.splat(0)), b.splat(0), x);
739         x = b.select(b.gt(x, limit     ), limit     , x);
740 
741         x = b.gather8(uniforms,0, x);
742 
743         b.store16(buf, x);
744     }
745 
746     if ((false)) {
747         SkDynamicMemoryWStream buf;
748         dump(b, &buf);
749         sk_sp<SkData> blob = buf.detachAsData();
750         SkDebugf("%.*s\n", blob->size(), blob->data());
751     }
752 
753     test_interpreter_only(r, b.done(), [&](const skvm::Program& program) {
754         const int N = 31;
755         int16_t buf[N];
756         for (int i = 0; i < N; i++) {
757             buf[i] = i;
758         }
759 
760         const int M = 16;
761         uint8_t img[M];
762         for (int i = 0; i < M; i++) {
763             img[i] = i*i;
764         }
765 
766         struct {
767             const uint8_t* img;
768             int      add   = 5;
769             uint8_t  mul   = 3;
770             uint16_t sub   = 18;
771             int      limit = M-1;
772         } uniforms{img};
773 
774         program.eval(N, buf, &uniforms);
775 
776         for (int i = 0; i < N; i++) {
777             // Our first math calculates x = (i+5)*3 - 18 a.k.a 3*(i-1).
778             int x = 3*(i-1);
779 
780             // Then that's pinned to the limits of img.
781             if (i < 2) { x =  0; }  // Notice i == 1 hits x == 0 exactly...
782             if (i > 5) { x = 15; }  // ...and i == 6 hits x == 15 exactly
783             REPORTER_ASSERT(r, buf[i] == img[x]);
784         }
785     });
786 }
787 
// This little memset32() program should be able to JIT, but if we run that
// JIT code in an MSAN build, it won't see the writes initialize buf.  So
// this tests that we're using the interpreter instead.
DEF_TEST(SkVM_MSAN, r) {
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        constexpr int K = 17;
        int buf[K];                 // Intentionally uninitialized.

        program.eval(K, buf);
        sk_msan_assert_initialized(buf, buf+K);

        for (int i = 0; i < K; i++) {
            REPORTER_ASSERT(r, buf[i] == 42);
        }
    });
}
805 
// Builds a program whose only content is an assert_true() that each loaded
// value is less than 42, then runs it over inputs that all satisfy that.
DEF_TEST(SkVM_assert, r) {
    skvm::Builder b;
    b.assert_true(b.lt(b.load32(b.varying<int>()),
                       b.splat(42)));

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& program) {
        int values[] = { 0,1,2,3,4,5,6,7,8,9 };
        program.eval(SK_ARRAY_COUNT(values), values);
    });
}
816 
DEF_TEST(SkVM_premul,reporter)817 DEF_TEST(SkVM_premul, reporter) {
818     // Test that premul is short-circuited when alpha is known opaque.
819     {
820         skvm::Builder p;
821         auto rptr = p.varying<int>(),
822              aptr = p.varying<int>();
823 
824         skvm::F32 r = p.bit_cast(p.load32(rptr)),
825                   g = p.splat(0.0f),
826                   b = p.splat(0.0f),
827                   a = p.bit_cast(p.load32(aptr));
828 
829         p.premul(&r, &g, &b, a);
830         p.store32(rptr, p.bit_cast(r));
831 
832         // load red, load alpha, red *= alpha, store red
833         REPORTER_ASSERT(reporter, p.done().instructions().size() == 4);
834     }
835 
836     {
837         skvm::Builder p;
838         auto rptr = p.varying<int>();
839 
840         skvm::F32 r = p.bit_cast(p.load32(rptr)),
841                   g = p.splat(0.0f),
842                   b = p.splat(0.0f),
843                   a = p.splat(1.0f);
844 
845         p.premul(&r, &g, &b, a);
846         p.store32(rptr, p.bit_cast(r));
847 
848         // load red, store red
849         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
850     }
851 
852     // Same deal for unpremul.
853     {
854         skvm::Builder p;
855         auto rptr = p.varying<int>(),
856              aptr = p.varying<int>();
857 
858         skvm::F32 r = p.bit_cast(p.load32(rptr)),
859                   g = p.splat(0.0f),
860                   b = p.splat(0.0f),
861                   a = p.bit_cast(p.load32(aptr));
862 
863         p.unpremul(&r, &g, &b, a);
864         p.store32(rptr, p.bit_cast(r));
865 
866         // load red, load alpha, a bunch of unpremul instructions, store red
867         REPORTER_ASSERT(reporter, p.done().instructions().size() >= 4);
868     }
869 
870     {
871         skvm::Builder p;
872         auto rptr = p.varying<int>();
873 
874         skvm::F32 r = p.bit_cast(p.load32(rptr)),
875                   g = p.splat(0.0f),
876                   b = p.splat(0.0f),
877                   a = p.splat(1.0f);
878 
879         p.unpremul(&r, &g, &b, a);
880         p.store32(rptr, p.bit_cast(r));
881 
882         // load red, store red
883         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
884     }
885 }
886 
887 template <typename Fn>
test_asm(skiatest::Reporter * r,Fn && fn,std::initializer_list<uint8_t> expected)888 static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
889     uint8_t buf[4096];
890     skvm::Assembler a{buf};
891     fn(a);
892 
893     REPORTER_ASSERT(r, a.size() == expected.size());
894 
895     auto got = (const uint8_t*)buf,
896          want = expected.begin();
897     for (int i = 0; i < (int)std::min(a.size(), expected.size()); i++) {
898         REPORTER_ASSERT(r, got[i] == want[i],
899                         "byte %d was %02x, want %02x", i, got[i], want[i]);
900     }
901 }
902 
DEF_TEST(SkVM_Assembler,r)903 DEF_TEST(SkVM_Assembler, r) {
904     // Easiest way to generate test cases is
905     //
906     //   echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
907     //
908     // The -x86-asm-syntax=intel bit is optional, controlling the
909     // input syntax only; the output will always be AT&T  op x,y,dst style.
910     // Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
911     // that a bit easier to use here, despite maybe favoring AT&T overall.
912 
913     using A = skvm::Assembler;
914     // Our exit strategy from AVX code.
915     test_asm(r, [&](A& a) {
916         a.int3();
917         a.vzeroupper();
918         a.ret();
919     },{
920         0xcc,
921         0xc5, 0xf8, 0x77,
922         0xc3,
923     });
924 
925     // Align should pad with zero
926     test_asm(r, [&](A& a) {
927         a.ret();
928         a.align(4);
929     },{
930         0xc3,
931         0x00, 0x00, 0x00,
932     });
933 
934     test_asm(r, [&](A& a) {
935         a.add(A::rax, 8);       // Always good to test rax.
936         a.sub(A::rax, 32);
937 
938         a.add(A::rdi, 12);      // Last 0x48 REX
939         a.sub(A::rdi, 8);
940 
941         a.add(A::r8 , 7);       // First 0x49 REX
942         a.sub(A::r8 , 4);
943 
944         a.add(A::rsi, 128);     // Requires 4 byte immediate.
945         a.sub(A::r8 , 1000000);
946     },{
947         0x48, 0x83, 0b11'000'000, 0x08,
948         0x48, 0x83, 0b11'101'000, 0x20,
949 
950         0x48, 0x83, 0b11'000'111, 0x0c,
951         0x48, 0x83, 0b11'101'111, 0x08,
952 
953         0x49, 0x83, 0b11'000'000, 0x07,
954         0x49, 0x83, 0b11'101'000, 0x04,
955 
956         0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
957         0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
958     });
959 
960 
961     test_asm(r, [&](A& a) {
962         a.vpaddd (A::ymm0, A::ymm1, A::ymm2);  // Low registers and 0x0f map     -> 2-byte VEX.
963         a.vpaddd (A::ymm8, A::ymm1, A::ymm2);  // A high dst register is ok      -> 2-byte VEX.
964         a.vpaddd (A::ymm0, A::ymm8, A::ymm2);  // A high first argument register -> 2-byte VEX.
965         a.vpaddd (A::ymm0, A::ymm1, A::ymm8);  // A high second argument         -> 3-byte VEX.
966         a.vpmulld(A::ymm0, A::ymm1, A::ymm2);  // Using non-0x0f map instruction -> 3-byte VEX.
967         a.vpsubd (A::ymm0, A::ymm1, A::ymm2);  // Test vpsubd to ensure argument order is right.
968     },{
969         /*    VEX     */ /*op*/ /*modRM*/
970         0xc5,       0xf5, 0xfe, 0xc2,
971         0xc5,       0x75, 0xfe, 0xc2,
972         0xc5,       0xbd, 0xfe, 0xc2,
973         0xc4, 0xc1, 0x75, 0xfe, 0xc0,
974         0xc4, 0xe2, 0x75, 0x40, 0xc2,
975         0xc5,       0xf5, 0xfa, 0xc2,
976     });
977 
978     test_asm(r, [&](A& a) {
979         a.vpcmpeqd (A::ymm0, A::ymm1, A::ymm2);
980         a.vpcmpgtd (A::ymm0, A::ymm1, A::ymm2);
981         a.vcmpeqps (A::ymm0, A::ymm1, A::ymm2);
982         a.vcmpltps (A::ymm0, A::ymm1, A::ymm2);
983         a.vcmpleps (A::ymm0, A::ymm1, A::ymm2);
984         a.vcmpneqps(A::ymm0, A::ymm1, A::ymm2);
985     },{
986         0xc5,0xf5,0x76,0xc2,
987         0xc5,0xf5,0x66,0xc2,
988         0xc5,0xf4,0xc2,0xc2,0x00,
989         0xc5,0xf4,0xc2,0xc2,0x01,
990         0xc5,0xf4,0xc2,0xc2,0x02,
991         0xc5,0xf4,0xc2,0xc2,0x04,
992     });
993 
994     test_asm(r, [&](A& a) {
995         a.vminps(A::ymm0, A::ymm1, A::ymm2);
996         a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
997     },{
998         0xc5,0xf4,0x5d,0xc2,
999         0xc5,0xf4,0x5f,0xc2,
1000     });
1001 
1002     test_asm(r, [&](A& a) {
1003         a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
1004     },{
1005         0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
1006     });
1007 
1008     test_asm(r, [&](A& a) {
1009         a.vpsrld(A::ymm15, A::ymm2, 8);
1010         a.vpsrld(A::ymm0 , A::ymm8, 5);
1011     },{
1012         0xc5,     0x85, 0x72,0xd2, 0x08,
1013         0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
1014     });
1015 
1016     test_asm(r, [&](A& a) {
1017         a.vpermq(A::ymm1, A::ymm2, 5);
1018     },{
1019         0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
1020     });
1021 
1022     test_asm(r, [&](A& a) {
1023         a.vroundps(A::ymm1, A::ymm2, A::NEAREST);
1024         a.vroundps(A::ymm1, A::ymm2, A::FLOOR);
1025         a.vroundps(A::ymm1, A::ymm2, A::CEIL);
1026         a.vroundps(A::ymm1, A::ymm2, A::TRUNC);
1027     },{
1028         0xc4,0xe3,0x7d,0x08,0xca,0x00,
1029         0xc4,0xe3,0x7d,0x08,0xca,0x01,
1030         0xc4,0xe3,0x7d,0x08,0xca,0x02,
1031         0xc4,0xe3,0x7d,0x08,0xca,0x03,
1032     });
1033 
1034     test_asm(r, [&](A& a) {
1035         A::Label l = a.here();
1036         a.byte(1);
1037         a.byte(2);
1038         a.byte(3);
1039         a.byte(4);
1040 
1041         a.vbroadcastss(A::ymm0 , &l);
1042         a.vbroadcastss(A::ymm1 , &l);
1043         a.vbroadcastss(A::ymm8 , &l);
1044         a.vbroadcastss(A::ymm15, &l);
1045 
1046         a.vpshufb(A::ymm4, A::ymm3, &l);
1047         a.vpaddd (A::ymm4, A::ymm3, &l);
1048         a.vpsubd (A::ymm4, A::ymm3, &l);
1049 
1050         a.vptest(A::ymm4, &l);
1051 
1052         a.vmulps (A::ymm4, A::ymm3, &l);
1053     },{
1054         0x01, 0x02, 0x03, 0x4,
1055 
1056         /*     VEX    */  /*op*/ /*   ModRM    */  /*     offset     */
1057         0xc4, 0xe2, 0x7d,  0x18,   0b00'000'101,   0xf3,0xff,0xff,0xff,   // 0xfffffff3 == -13
1058         0xc4, 0xe2, 0x7d,  0x18,   0b00'001'101,   0xea,0xff,0xff,0xff,   // 0xffffffea == -22
1059         0xc4, 0x62, 0x7d,  0x18,   0b00'000'101,   0xe1,0xff,0xff,0xff,   // 0xffffffe1 == -31
1060         0xc4, 0x62, 0x7d,  0x18,   0b00'111'101,   0xd8,0xff,0xff,0xff,   // 0xffffffd8 == -40
1061 
1062         0xc4, 0xe2, 0x65,  0x00,   0b00'100'101,   0xcf,0xff,0xff,0xff,   // 0xffffffcf == -49
1063 
1064         0xc5, 0xe5,        0xfe,   0b00'100'101,   0xc7,0xff,0xff,0xff,   // 0xffffffc7 == -57
1065         0xc5, 0xe5,        0xfa,   0b00'100'101,   0xbf,0xff,0xff,0xff,   // 0xffffffbf == -65
1066 
1067         0xc4, 0xe2, 0x7d,  0x17,   0b00'100'101,   0xb6,0xff,0xff,0xff,   // 0xffffffb6 == -74
1068 
1069         0xc5, 0xe4,        0x59,   0b00'100'101,   0xae,0xff,0xff,0xff,   // 0xffffffaf == -82
1070     });
1071 
1072     test_asm(r, [&](A& a) {
1073         a.vbroadcastss(A::ymm0,  A::rdi,   0);
1074         a.vbroadcastss(A::ymm13, A::r14,   7);
1075         a.vbroadcastss(A::ymm8,  A::rdx, -12);
1076         a.vbroadcastss(A::ymm8,  A::rdx, 400);
1077 
1078         a.vbroadcastss(A::ymm8,  A::xmm0);
1079         a.vbroadcastss(A::ymm0,  A::xmm13);
1080     },{
1081         /*   VEX    */ /*op*/     /*ModRM*/   /*offset*/
1082         0xc4,0xe2,0x7d, 0x18,   0b00'000'111,
1083         0xc4,0x42,0x7d, 0x18,   0b01'101'110,  0x07,
1084         0xc4,0x62,0x7d, 0x18,   0b01'000'010,  0xf4,
1085         0xc4,0x62,0x7d, 0x18,   0b10'000'010,  0x90,0x01,0x00,0x00,
1086 
1087         0xc4,0x62,0x7d, 0x18,   0b11'000'000,
1088         0xc4,0xc2,0x7d, 0x18,   0b11'000'101,
1089     });
1090 
1091     test_asm(r, [&](A& a) {
1092         A::Label l = a.here();
1093         a.jne(&l);
1094         a.jne(&l);
1095         a.je (&l);
1096         a.jmp(&l);
1097         a.jl (&l);
1098         a.jc (&l);
1099 
1100         a.cmp(A::rdx, 0);
1101         a.cmp(A::rax, 12);
1102         a.cmp(A::r14, 2000000000);
1103     },{
1104         0x0f,0x85, 0xfa,0xff,0xff,0xff,   // near jne -6 bytes
1105         0x0f,0x85, 0xf4,0xff,0xff,0xff,   // near jne -12 bytes
1106         0x0f,0x84, 0xee,0xff,0xff,0xff,   // near je  -18 bytes
1107         0xe9,      0xe9,0xff,0xff,0xff,   // near jmp -23 bytes
1108         0x0f,0x8c, 0xe3,0xff,0xff,0xff,   // near jl  -29 bytes
1109         0x0f,0x82, 0xdd,0xff,0xff,0xff,   // near jc  -35 bytes
1110 
1111         0x48,0x83,0xfa,0x00,
1112         0x48,0x83,0xf8,0x0c,
1113         0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
1114     });
1115 
1116     test_asm(r, [&](A& a) {
1117         a.vmovups(A::ymm5, A::rsi);
1118         a.vmovups(A::rsi, A::ymm5);
1119 
1120         a.vmovups(A::rsi, A::xmm5);
1121 
1122         a.vpmovzxwd(A::ymm4, A::rsi);
1123         a.vpmovzxbd(A::ymm4, A::rsi);
1124 
1125         a.vmovq(A::rdx, A::xmm15);
1126     },{
1127         /*    VEX    */  /*Op*/  /*  ModRM  */
1128         0xc5,     0xfc,   0x10,  0b00'101'110,
1129         0xc5,     0xfc,   0x11,  0b00'101'110,
1130 
1131         0xc5,     0xf8,   0x11,  0b00'101'110,
1132 
1133         0xc4,0xe2,0x7d,   0x33,  0b00'100'110,
1134         0xc4,0xe2,0x7d,   0x31,  0b00'100'110,
1135 
1136         0xc5,     0x79,   0xd6,  0b00'111'010,
1137     });
1138 
1139     test_asm(r, [&](A& a) {
1140         a.movzbl(A::rax, A::rsi, 0);   // Low registers for src and dst.
1141         a.movzbl(A::rax, A::r8,  0);   // High src register.
1142         a.movzbl(A::r8 , A::rsi, 0);   // High dst register.
1143         a.movzbl(A::r8,  A::rsi, 12);
1144         a.movzbl(A::r8,  A::rsi, 400);
1145 
1146         a.vmovd(A::rax, A::xmm0);
1147         a.vmovd(A::rax, A::xmm8);
1148         a.vmovd(A::r8,  A::xmm0);
1149 
1150         a.vmovd(A::xmm0, A::rax);
1151         a.vmovd(A::xmm8, A::rax);
1152         a.vmovd(A::xmm0, A::r8);
1153 
1154         a.vmovd(A::xmm0 , A::FOUR, A::rcx, A::rax);
1155         a.vmovd(A::xmm15, A::TWO,  A::r8,  A::rax);
1156         a.vmovd(A::xmm0 , A::ONE,  A::rcx, A::r8);
1157 
1158         a.vmovd_direct(A::rax, A::xmm0);
1159         a.vmovd_direct(A::rax, A::xmm8);
1160         a.vmovd_direct(A::r8,  A::xmm0);
1161 
1162         a.vmovd_direct(A::xmm0, A::rax);
1163         a.vmovd_direct(A::xmm8, A::rax);
1164         a.vmovd_direct(A::xmm0, A::r8);
1165 
1166         a.movb(A::rdx, A::rax);
1167         a.movb(A::rdx, A::r8);
1168         a.movb(A::r8 , A::rax);
1169     },{
1170         0x0f,0xb6,0x06,
1171         0x41,0x0f,0xb6,0x00,
1172         0x44,0x0f,0xb6,0x06,
1173         0x44,0x0f,0xb6,0x46, 12,
1174         0x44,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,
1175 
1176         0xc5,0xf9,0x7e,0x00,
1177         0xc5,0x79,0x7e,0x00,
1178         0xc4,0xc1,0x79,0x7e,0x00,
1179 
1180         0xc5,0xf9,0x6e,0x00,
1181         0xc5,0x79,0x6e,0x00,
1182         0xc4,0xc1,0x79,0x6e,0x00,
1183 
1184         0xc5,0xf9,0x6e,0x04,0x88,
1185         0xc4,0x21,0x79,0x6e,0x3c,0x40,
1186         0xc4,0xc1,0x79,0x6e,0x04,0x08,
1187 
1188         0xc5,0xf9,0x7e,0xc0,
1189         0xc5,0x79,0x7e,0xc0,
1190         0xc4,0xc1,0x79,0x7e,0xc0,
1191 
1192         0xc5,0xf9,0x6e,0xc0,
1193         0xc5,0x79,0x6e,0xc0,
1194         0xc4,0xc1,0x79,0x6e,0xc0,
1195 
1196         0x88, 0x02,
1197         0x44, 0x88, 0x02,
1198         0x41, 0x88, 0x00,
1199     });
1200 
1201     test_asm(r, [&](A& a) {
1202         a.vpinsrw(A::xmm1, A::xmm8, A::rsi, 4);
1203         a.vpinsrw(A::xmm8, A::xmm1, A::r8, 12);
1204 
1205         a.vpinsrb(A::xmm1, A::xmm8, A::rsi, 4);
1206         a.vpinsrb(A::xmm8, A::xmm1, A::r8, 12);
1207 
1208         a.vpextrw(A::rsi, A::xmm8, 7);
1209         a.vpextrw(A::r8,  A::xmm1, 15);
1210 
1211         a.vpextrb(A::rsi, A::xmm8, 7);
1212         a.vpextrb(A::r8,  A::xmm1, 15);
1213     },{
1214         0xc5,0xb9,      0xc4, 0x0e,  4,
1215         0xc4,0x41,0x71, 0xc4, 0x00, 12,
1216 
1217         0xc4,0xe3,0x39, 0x20, 0x0e,  4,
1218         0xc4,0x43,0x71, 0x20, 0x00, 12,
1219 
1220         0xc4,0x63,0x79, 0x15, 0x06,  7,
1221         0xc4,0xc3,0x79, 0x15, 0x08, 15,
1222 
1223         0xc4,0x63,0x79, 0x14, 0x06,  7,
1224         0xc4,0xc3,0x79, 0x14, 0x08, 15,
1225     });
1226 
1227     test_asm(r, [&](A& a) {
1228         a.vpandn(A::ymm3, A::ymm12, A::ymm2);
1229     },{
1230         0xc5, 0x9d, 0xdf, 0xda,
1231     });
1232 
1233     test_asm(r, [&](A& a) {
1234         a.vmovdqa   (A::ymm3, A::ymm2);
1235         a.vcvttps2dq(A::ymm3, A::ymm2);
1236         a.vcvtdq2ps (A::ymm3, A::ymm2);
1237         a.vcvtps2dq (A::ymm3, A::ymm2);
1238         a.vsqrtps   (A::ymm3, A::ymm2);
1239     },{
1240         0xc5,0xfd,0x6f,0xda,
1241         0xc5,0xfe,0x5b,0xda,
1242         0xc5,0xfc,0x5b,0xda,
1243         0xc5,0xfd,0x5b,0xda,
1244         0xc5,0xfc,0x51,0xda,
1245     });
1246 
1247     test_asm(r, [&](A& a) {
1248         a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
1249         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1250         a.vgatherdps(A::ymm10, A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1251         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm12, A::rax, A::ymm1 );
1252         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::r9 , A::ymm1 );
1253         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm12);
1254         a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
1255     },{
1256         0xc4,0xe2,0x6d,0x92,0x0c,0x87,
1257         0xc4,0xe2,0x75,0x92,0x04,0x10,
1258         0xc4,0x62,0x75,0x92,0x14,0x10,
1259         0xc4,0xa2,0x75,0x92,0x04,0x20,
1260         0xc4,0xc2,0x75,0x92,0x04,0x11,
1261         0xc4,0xe2,0x1d,0x92,0x04,0x10,
1262         0xc4,0xe2,0x1d,0x92,0x04,0xd0,
1263     });
1264 
1265     test_asm(r, [&](A& a) {
1266         a.movq(A::rax, A::rdi, 0);
1267         a.movq(A::rax, A::rdi, 1);
1268         a.movq(A::rax, A::rdi, 512);
1269         a.movq(A::r15, A::r13, 42);
1270         a.movq(A::rax, A::r13, 42);
1271         a.movq(A::r15, A::rax, 42);
1272     },{
1273         0x48, 0x8b, 0x07,
1274         0x48, 0x8b, 0x47, 0x01,
1275         0x48, 0x8b, 0x87, 0x00,0x02,0x00,0x00,
1276         0x4d, 0x8b, 0x7d, 0x2a,
1277         0x49, 0x8b, 0x45, 0x2a,
1278         0x4c, 0x8b, 0x78, 0x2a,
1279     });
1280 
1281     // echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
1282 
1283     test_asm(r, [&](A& a) {
1284         a.and16b(A::v4, A::v3, A::v1);
1285         a.orr16b(A::v4, A::v3, A::v1);
1286         a.eor16b(A::v4, A::v3, A::v1);
1287         a.bic16b(A::v4, A::v3, A::v1);
1288         a.bsl16b(A::v4, A::v3, A::v1);
1289         a.not16b(A::v4, A::v3);
1290 
1291         a.add4s(A::v4, A::v3, A::v1);
1292         a.sub4s(A::v4, A::v3, A::v1);
1293         a.mul4s(A::v4, A::v3, A::v1);
1294 
1295         a.cmeq4s(A::v4, A::v3, A::v1);
1296         a.cmgt4s(A::v4, A::v3, A::v1);
1297 
1298         a.sub8h(A::v4, A::v3, A::v1);
1299         a.mul8h(A::v4, A::v3, A::v1);
1300 
1301         a.fadd4s(A::v4, A::v3, A::v1);
1302         a.fsub4s(A::v4, A::v3, A::v1);
1303         a.fmul4s(A::v4, A::v3, A::v1);
1304         a.fdiv4s(A::v4, A::v3, A::v1);
1305         a.fmin4s(A::v4, A::v3, A::v1);
1306         a.fmax4s(A::v4, A::v3, A::v1);
1307 
1308         a.fmla4s(A::v4, A::v3, A::v1);
1309         a.fmls4s(A::v4, A::v3, A::v1);
1310 
1311         a.fcmeq4s(A::v4, A::v3, A::v1);
1312         a.fcmgt4s(A::v4, A::v3, A::v1);
1313         a.fcmge4s(A::v4, A::v3, A::v1);
1314     },{
1315         0x64,0x1c,0x21,0x4e,
1316         0x64,0x1c,0xa1,0x4e,
1317         0x64,0x1c,0x21,0x6e,
1318         0x64,0x1c,0x61,0x4e,
1319         0x64,0x1c,0x61,0x6e,
1320         0x64,0x58,0x20,0x6e,
1321 
1322         0x64,0x84,0xa1,0x4e,
1323         0x64,0x84,0xa1,0x6e,
1324         0x64,0x9c,0xa1,0x4e,
1325 
1326         0x64,0x8c,0xa1,0x6e,
1327         0x64,0x34,0xa1,0x4e,
1328 
1329         0x64,0x84,0x61,0x6e,
1330         0x64,0x9c,0x61,0x4e,
1331 
1332         0x64,0xd4,0x21,0x4e,
1333         0x64,0xd4,0xa1,0x4e,
1334         0x64,0xdc,0x21,0x6e,
1335         0x64,0xfc,0x21,0x6e,
1336         0x64,0xf4,0xa1,0x4e,
1337         0x64,0xf4,0x21,0x4e,
1338 
1339         0x64,0xcc,0x21,0x4e,
1340         0x64,0xcc,0xa1,0x4e,
1341 
1342         0x64,0xe4,0x21,0x4e,
1343         0x64,0xe4,0xa1,0x6e,
1344         0x64,0xe4,0x21,0x6e,
1345     });
1346 
1347     test_asm(r, [&](A& a) {
1348         a.shl4s(A::v4, A::v3,  0);
1349         a.shl4s(A::v4, A::v3,  1);
1350         a.shl4s(A::v4, A::v3,  8);
1351         a.shl4s(A::v4, A::v3, 16);
1352         a.shl4s(A::v4, A::v3, 31);
1353 
1354         a.sshr4s(A::v4, A::v3,  1);
1355         a.sshr4s(A::v4, A::v3,  8);
1356         a.sshr4s(A::v4, A::v3, 31);
1357 
1358         a.ushr4s(A::v4, A::v3,  1);
1359         a.ushr4s(A::v4, A::v3,  8);
1360         a.ushr4s(A::v4, A::v3, 31);
1361 
1362         a.ushr8h(A::v4, A::v3,  1);
1363         a.ushr8h(A::v4, A::v3,  8);
1364         a.ushr8h(A::v4, A::v3, 15);
1365     },{
1366         0x64,0x54,0x20,0x4f,
1367         0x64,0x54,0x21,0x4f,
1368         0x64,0x54,0x28,0x4f,
1369         0x64,0x54,0x30,0x4f,
1370         0x64,0x54,0x3f,0x4f,
1371 
1372         0x64,0x04,0x3f,0x4f,
1373         0x64,0x04,0x38,0x4f,
1374         0x64,0x04,0x21,0x4f,
1375 
1376         0x64,0x04,0x3f,0x6f,
1377         0x64,0x04,0x38,0x6f,
1378         0x64,0x04,0x21,0x6f,
1379 
1380         0x64,0x04,0x1f,0x6f,
1381         0x64,0x04,0x18,0x6f,
1382         0x64,0x04,0x11,0x6f,
1383     });
1384 
1385     test_asm(r, [&](A& a) {
1386         a.sli4s(A::v4, A::v3,  0);
1387         a.sli4s(A::v4, A::v3,  1);
1388         a.sli4s(A::v4, A::v3,  8);
1389         a.sli4s(A::v4, A::v3, 16);
1390         a.sli4s(A::v4, A::v3, 31);
1391     },{
1392         0x64,0x54,0x20,0x6f,
1393         0x64,0x54,0x21,0x6f,
1394         0x64,0x54,0x28,0x6f,
1395         0x64,0x54,0x30,0x6f,
1396         0x64,0x54,0x3f,0x6f,
1397     });
1398 
1399     test_asm(r, [&](A& a) {
1400         a.scvtf4s (A::v4, A::v3);
1401         a.fcvtzs4s(A::v4, A::v3);
1402         a.fcvtns4s(A::v4, A::v3);
1403     },{
1404         0x64,0xd8,0x21,0x4e,
1405         0x64,0xb8,0xa1,0x4e,
1406         0x64,0xa8,0x21,0x4e,
1407     });
1408 
1409     test_asm(r, [&](A& a) {
1410         a.brk(0);
1411         a.brk(65535);
1412 
1413         a.ret(A::x30);   // Conventional ret using link register.
1414         a.ret(A::x13);   // Can really return using any register if we like.
1415 
1416         a.add(A::x2, A::x2,  4);
1417         a.add(A::x3, A::x2, 32);
1418 
1419         a.sub(A::x2, A::x2, 4);
1420         a.sub(A::x3, A::x2, 32);
1421 
1422         a.subs(A::x2, A::x2,  4);
1423         a.subs(A::x3, A::x2, 32);
1424 
1425         a.subs(A::xzr, A::x2, 4);  // These are actually the same instruction!
1426         a.cmp(A::x2, 4);
1427 
1428         A::Label l = a.here();
1429         a.bne(&l);
1430         a.bne(&l);
1431         a.blt(&l);
1432         a.b(&l);
1433         a.cbnz(A::x2, &l);
1434         a.cbz(A::x2, &l);
1435     },{
1436         0x00,0x00,0x20,0xd4,
1437         0xe0,0xff,0x3f,0xd4,
1438 
1439         0xc0,0x03,0x5f,0xd6,
1440         0xa0,0x01,0x5f,0xd6,
1441 
1442         0x42,0x10,0x00,0x91,
1443         0x43,0x80,0x00,0x91,
1444 
1445         0x42,0x10,0x00,0xd1,
1446         0x43,0x80,0x00,0xd1,
1447 
1448         0x42,0x10,0x00,0xf1,
1449         0x43,0x80,0x00,0xf1,
1450 
1451         0x5f,0x10,0x00,0xf1,
1452         0x5f,0x10,0x00,0xf1,
1453 
1454         0x01,0x00,0x00,0x54,   // b.ne #0
1455         0xe1,0xff,0xff,0x54,   // b.ne #-4
1456         0xcb,0xff,0xff,0x54,   // b.lt #-8
1457         0xae,0xff,0xff,0x54,   // b.al #-12
1458         0x82,0xff,0xff,0xb5,   // cbnz x2, #-16
1459         0x62,0xff,0xff,0xb4,   // cbz x2, #-20
1460     });
1461 
1462     // Can we cbz() to a not-yet-defined label?
1463     test_asm(r, [&](A& a) {
1464         A::Label l;
1465         a.cbz(A::x2, &l);
1466         a.add(A::x3, A::x2, 32);
1467         a.label(&l);
1468         a.ret(A::x30);
1469     },{
1470         0x42,0x00,0x00,0xb4,  // cbz x2, #8
1471         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1472         0xc0,0x03,0x5f,0xd6,  // ret
1473     });
1474 
1475     // If we start a label as a backward label,
1476     // can we redefine it to be a future label?
1477     // (Not sure this is useful... just want to test it works.)
1478     test_asm(r, [&](A& a) {
1479         A::Label l1 = a.here();
1480         a.add(A::x3, A::x2, 32);
1481         a.cbz(A::x2, &l1);          // This will jump backward... nothing sneaky.
1482 
1483         A::Label l2 = a.here();     // Start off the same...
1484         a.add(A::x3, A::x2, 32);
1485         a.cbz(A::x2, &l2);          // Looks like this will go backward...
1486         a.add(A::x2, A::x2, 4);
1487         a.add(A::x3, A::x2, 32);
1488         a.label(&l2);               // But no... actually forward!  What a switcheroo!
1489     },{
1490         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1491         0xe2,0xff,0xff,0xb4,  // cbz x2, #-4
1492 
1493         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1494         0x62,0x00,0x00,0xb4,  // cbz x2, #12
1495         0x42,0x10,0x00,0x91,  // add x2, x2, #4
1496         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1497     });
1498 
1499     // Loading from a label on ARM.
1500     test_asm(r, [&](A& a) {
1501         A::Label fore,aft;
1502         a.label(&fore);
1503         a.word(0x01234567);
1504         a.ldrq(A::v1, &fore);
1505         a.ldrq(A::v2, &aft);
1506         a.label(&aft);
1507         a.word(0x76543210);
1508     },{
1509         0x67,0x45,0x23,0x01,
1510         0xe1,0xff,0xff,0x9c,  // ldr q1, #-4
1511         0x22,0x00,0x00,0x9c,  // ldr q2, #4
1512         0x10,0x32,0x54,0x76,
1513     });
1514 
1515     test_asm(r, [&](A& a) {
1516         a.ldrq(A::v0, A::x8);
1517         a.strq(A::v0, A::x8);
1518     },{
1519         0x00,0x01,0xc0,0x3d,
1520         0x00,0x01,0x80,0x3d,
1521     });
1522 
1523     test_asm(r, [&](A& a) {
1524         a.xtns2h(A::v0, A::v0);
1525         a.xtnh2b(A::v0, A::v0);
1526         a.strs  (A::v0, A::x0);
1527 
1528         a.ldrs   (A::v0, A::x0);
1529         a.uxtlb2h(A::v0, A::v0);
1530         a.uxtlh2s(A::v0, A::v0);
1531 
1532         a.uminv4s(A::v3, A::v4);
1533         a.fmovs  (A::x3, A::v4);  // fmov w3,s4
1534     },{
1535         0x00,0x28,0x61,0x0e,
1536         0x00,0x28,0x21,0x0e,
1537         0x00,0x00,0x00,0xbd,
1538 
1539         0x00,0x00,0x40,0xbd,
1540         0x00,0xa4,0x08,0x2f,
1541         0x00,0xa4,0x10,0x2f,
1542 
1543         0x83,0xa8,0xb1,0x6e,
1544         0x83,0x00,0x26,0x1e,
1545     });
1546 
1547     test_asm(r, [&](A& a) {
1548         a.ldrb(A::v0, A::x8);
1549         a.strb(A::v0, A::x8);
1550     },{
1551         0x00,0x01,0x40,0x3d,
1552         0x00,0x01,0x00,0x3d,
1553     });
1554 
1555     test_asm(r, [&](A& a) {
1556         a.tbl(A::v0, A::v1, A::v2);
1557     },{
1558         0x20,0x00,0x02,0x4e,
1559     });
1560 }
1561