• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkColorPriv.h"
9 #include "include/private/SkColorData.h"
10 #include "src/core/SkCpu.h"
11 #include "src/core/SkMSAN.h"
12 #include "src/core/SkVM.h"
13 #include "src/gpu/GrShaderCaps.h"
14 #include "src/sksl/SkSLCompiler.h"
15 #include "src/sksl/codegen/SkSLVMCodeGenerator.h"
16 #include "src/sksl/tracing/SkVMDebugTrace.h"
17 #include "src/utils/SkVMVisualizer.h"
18 #include "tests/Test.h"
19 
20 template <typename Fn>
test_jit_and_interpreter(const skvm::Builder & b,Fn && test)21 static void test_jit_and_interpreter(const skvm::Builder& b, Fn&& test) {
22     skvm::Program p = b.done();
23     test(p);
24     if (p.hasJIT()) {
25         test(b.done(/*debug_name=*/nullptr, /*allow_jit=*/false));
26     }
27 }
28 
// Builds a program that loads and adds but never stores, then checks that
// skvm::eliminate_dead_code() removes every instruction.
DEF_TEST(SkVM_eliminate_dead_code, r) {
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();
        skvm::I32 loaded  = b.load32(arg);
        skvm::I32 doubled = b.add(loaded, loaded);
        b.add(doubled, b.splat(7));
    }

    // Before elimination: load32, add, splat, add.
    std::vector<skvm::Instruction> insts = b.program();
    REPORTER_ASSERT(r, insts.size() == 4);

    insts = skvm::eliminate_dead_code(insts);
    REPORTER_ASSERT(r, insts.size() == 0);
}
44 
DEF_TEST(SkVM_Pointless, r) {
    // A program with no memory arguments: all of it should be marked as dead
    // code, yet evaluating it must still be possible.
    skvm::Builder b;
    {
        b.add(b.splat(5.0f),
              b.splat(4.0f));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        for (int n = 0; n < 64; n++) {
            program.eval(n);
        }
    });

    const std::vector<skvm::OptimizedInstruction> optimized = b.optimize();
    for (const skvm::OptimizedInstruction& inst : optimized) {
        REPORTER_ASSERT(r, inst.death == 0 && inst.can_hoist == true);
    }
}
64 
DEF_TEST(SkVM_memset, r) {
    // Program: store 42 into each element of its single int argument.
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        constexpr int kCount = 17;
        int buf[kCount + 1];
        buf[kCount] = 47;  // Sentinel one past the range handed to eval().

        p.eval(kCount, buf);
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, buf[i] == 42);
        }
        // The sentinel must not have been overwritten.
        REPORTER_ASSERT(r, buf[kCount] == 47);
    });
}
80 
DEF_TEST(SkVM_memcpy, r) {
    // Program: dst[i] = src[i].
    skvm::Builder b;
    {
        auto src = b.varying<int>(),
             dst = b.varying<int>();
        b.store32(dst, b.load32(src));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int src[] = {1,2,3,4,5,6,7,8,9},
            dst[] = {0,0,0,0,0,0,0,0,0};

        // Copy everything except the final element, which must stay zero in dst.
        const size_t copied = SK_ARRAY_COUNT(src)-1;
        p.eval(copied, src, dst);

        for (size_t i = 0; i < copied; i++) {
            REPORTER_ASSERT(r, dst[i] == src[i]);
        }
        REPORTER_ASSERT(r, dst[copied] == 0);
    });
}
101 
// If a program built with allow_jit=true ends up with a JIT, the same builder
// with allow_jit=false must not produce one.
DEF_TEST(SkVM_allow_jit, r) {
    skvm::Builder b;
    {
        auto src = b.varying<int>(),
             dst = b.varying<int>();
        b.store32(dst, b.load32(src));
    }

    const bool jitAvailable = b.done("test-allow_jit", /*allow_jit=*/true).hasJIT();
    if (jitAvailable) {
        REPORTER_ASSERT(r, !b.done("", false).hasJIT());
    }
}
114 
DEF_TEST(SkVM_LoopCounts, r) {
    // Run the program for every N from 0 through the buffer length and make
    // sure exactly the first N elements are touched each time.

    // Program: buf[i] += 1
    skvm::Builder b;
    skvm::Ptr arg = b.varying<int>();
    b.store32(arg,
              b.add(b.splat(1),
                    b.load32(arg)));

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[64];
        const int len = (int)SK_ARRAY_COUNT(buf);

        for (int n = 0; n <= len; n++) {
            for (int i = 0; i < len; i++) {
                buf[i] = i;
            }
            program.eval(n, buf);

            for (int i = 0; i < len; i++) {
                // Incremented inside [0,n), untouched from n onward.
                const int want = (i < n) ? i+1 : i;
                REPORTER_ASSERT(r, buf[i] == want);
            }
        }
    });
}
142 
DEF_TEST(SkVM_gather32, r) {
    // Program: buf[i] = gather32(uniforms, 0, buf[i] & 7).
    skvm::Builder b;
    {
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr buf = b.varying<int>();
        skvm::I32 x = b.load32(buf);
        skvm::I32 ix = b.bit_and(x, b.splat(7));
        b.store32(buf, b.gather32(uniforms,0, ix));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        constexpr int kCount = 20;
        int buf[kCount];
        for (int i = 0; i < kCount; i++) {
            buf[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        program.eval(kCount, &uniforms, buf);

        // The index is masked to 0..7, so the output cycles through img.
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, buf[i] == img[i & 7]);
        }
    });
}
190 
DEF_TEST(SkVM_gathers,r)191 DEF_TEST(SkVM_gathers, r) {
192     skvm::Builder b;
193     {
194         skvm::UPtr uniforms = b.uniform();
195         skvm::Ptr buf32    = b.varying<int>(),
196                   buf16    = b.varying<uint16_t>(),
197                   buf8     = b.varying<uint8_t>();
198 
199         skvm::I32 x = b.load32(buf32);
200 
201         b.store32(buf32, b.gather32(uniforms,0, b.bit_and(x, b.splat( 7))));
202         b.store16(buf16, b.gather16(uniforms,0, b.bit_and(x, b.splat(15))));
203         b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
204     }
205 
206     test_jit_and_interpreter(b, [&](const skvm::Program& program) {
207         const int img[] = {12,34,56,78, 90,98,76,54};
208 
209         constexpr int N = 20;
210         int      buf32[N];
211         uint16_t buf16[N];
212         uint8_t  buf8 [N];
213 
214         for (int i = 0; i < 20; i++) {
215             buf32[i] = i;
216         }
217 
218         struct Uniforms {
219             const int* img;
220         } uniforms{img};
221 
222         program.eval(N, &uniforms, buf32, buf16, buf8);
223         int i = 0;
224         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 12); i++;
225         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
226         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
227         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
228         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 56 && buf8[i] == 34); i++;
229         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
230         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 78 && buf8[i] ==  0); i++;
231         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
232 
233         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 90 && buf8[i] == 56); i++;
234         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
235         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 98 && buf8[i] ==  0); i++;
236         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
237         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 76 && buf8[i] == 78); i++;
238         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
239         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 54 && buf8[i] ==  0); i++;
240         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
241 
242         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 90); i++;
243         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
244         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
245         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
246     });
247 }
248 
DEF_TEST(SkVM_gathers2, r) {
    // Same three gathers as SkVM_gathers, but indexed by the unmasked loaded
    // value and reading from a byte image.
    skvm::Builder b;
    {
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr buf32    = b.varying<int>(),
                  buf16    = b.varying<uint16_t>(),
                  buf8     = b.varying<uint8_t>();

        skvm::I32 x = b.load32(buf32);

        b.store32(buf32, b.gather32(uniforms,0, x));
        b.store16(buf16, b.gather16(uniforms,0, x));
        b.store8 (buf8 , b.gather8 (uniforms,0, x));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        // img[k] == k for every byte.
        uint8_t img[256];
        for (int k = 0; k < 256; k++) {
            img[k] = k;
        }

        constexpr int kCount = 64;
        int      buf32[kCount];
        uint16_t buf16[kCount];
        uint8_t  buf8 [kCount];

        for (int i = 0; i < kCount; i++) {
            buf32[i] = (i*47)&63;
            buf16[i] = 0;
            buf8 [i] = 0;
        }

        struct Uniforms {
            const uint8_t* img;
        } uniforms{img};

        program.eval(kCount, &uniforms, buf32, buf16, buf8);

        for (int i = 0; i < kCount; i++) {
            const int index = (i*47)&63;  // 0,47,30,13,60,...
            REPORTER_ASSERT(r, buf8[i] == index);
        }

        // Spot-check the first and last elements of the wider gathers.
        REPORTER_ASSERT(r, buf16[ 0] == 0x0100);
        REPORTER_ASSERT(r, buf16[63] == 0x2322);

        REPORTER_ASSERT(r, buf32[ 0] == 0x03020100);
        REPORTER_ASSERT(r, buf32[63] == 0x47464544);
    });
}
297 
DEF_TEST(SkVM_bitops, r) {
    // Chains the bitwise and shift ops together; the trailing comments show
    // the running value for an input of 0x42.
    skvm::Builder b;
    {
        skvm::Ptr ptr = b.varying<int>();

        skvm::I32 v = b.load32(ptr);

        v = b.bit_and  (v, b.splat(0xf1));  // 0x40
        v = b.bit_or   (v, b.splat(0x80));  // 0xc0
        v = b.bit_xor  (v, b.splat(0xfe));  // 0x3e
        v = b.bit_clear(v, b.splat(0x30));  // 0x0e

        v = b.shl(v, 28);  // 0xe000'0000
        v = b.sra(v, 28);  // 0xffff'fffe
        v = b.shr(v,  1);  // 0x7fff'ffff

        b.store32(ptr, v);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int value = 0x42;
        program.eval(1, &value);
        REPORTER_ASSERT(r, value == 0x7fff'ffff);
    });
}
323 
DEF_TEST(SkVM_select_is_NaN, r) {
    // Program: dst[i] = is_NaN(src[i]) ? 0.0f : src[i].
    skvm::Builder b;
    {
        skvm::Ptr src = b.varying<float>(),
                  dst = b.varying<float>();

        skvm::F32 x = b.loadF(src);
        x = select(is_NaN(x), b.splat(0.0f)
                            , x);
        b.storeF(dst, x);
    }

    // The optimized program is expected to be exactly these four ops.
    std::vector<skvm::OptimizedInstruction> optimized = b.optimize();
    REPORTER_ASSERT(r, optimized.size() == 4);
    REPORTER_ASSERT(r, optimized[0].op == skvm::Op::load32);
    REPORTER_ASSERT(r, optimized[1].op == skvm::Op::neq_f32);
    REPORTER_ASSERT(r, optimized[2].op == skvm::Op::bit_clear);
    REPORTER_ASSERT(r, optimized[3].op == skvm::Op::store32);

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        // ±NaN, ±0, ±1, ±inf
        uint32_t src[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                          0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        uint32_t dst[SK_ARRAY_COUNT(src)];
        program.eval(SK_ARRAY_COUNT(src), src, dst);

        // Only the first two inputs are NaN; everything else passes through.
        for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
            const uint32_t want = (i < 2 ? 0 : src[i]);
            REPORTER_ASSERT(r, dst[i] == want);
        }
    });
}
355 
DEF_TEST(SkVM_f32, r) {
    // Program: add, sub and div arranged so every non-zero input becomes 1.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<float>();

        skvm::F32 x       = b.loadF(arg);
        skvm::F32 twice   = b.add(x,x);          // 2x
        skvm::F32 same    = b.sub(twice,x);      // 2x-x = x
        skvm::F32 unity   = b.div(same,x);       // x/x = 1
        b.storeF(arg, unity);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float buf[] = { 1,2,3,4,5,6,7,8,9 };
        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            REPORTER_ASSERT(r, buf[i] == 1.0f);
        }
    });
}
376 
DEF_TEST(SkVM_cmp_i32, r) {
    // Packs the results of six integer comparisons into bits 0..5 of the output.
    skvm::Builder b;
    {
        skvm::I32 x = b.load32(b.varying<int>());

        // Reduce a comparison mask to a single bit placed at `shift`.
        auto to_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, to_bit(0, b.eq (x, b.splat(0))));
        bits = b.bit_or(bits, to_bit(1, b.neq(x, b.splat(1))));
        bits = b.bit_or(bits, to_bit(2, b.lt (x, b.splat(2))));
        bits = b.bit_or(bits, to_bit(3, b.lte(x, b.splat(3))));
        bits = b.bit_or(bits, to_bit(4, b.gt (x, b.splat(4))));
        bits = b.bit_or(bits, to_bit(5, b.gte(x, b.splat(5))));

        b.store32(b.varying<int>(), bits);
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0..4 each have their own pattern; 5 and above all share one.
        const int want[] = {
            0b001111, 0b001100, 0b001010, 0b001010, 0b000010,
            0b110010, 0b110010, 0b110010, 0b110010, 0b110010,
        };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == want[i]);
        }
    });
}
412 
DEF_TEST(SkVM_cmp_f32, r) {
    // Packs the results of six float comparisons into bits 0..5 of the output.
    skvm::Builder b;
    {
        skvm::F32 x = b.loadF(b.varying<float>());

        // Reduce a comparison mask to a single bit placed at `shift`.
        auto to_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, to_bit(0, b.eq (x, b.splat(0.0f))));
        bits = b.bit_or(bits, to_bit(1, b.neq(x, b.splat(1.0f))));
        bits = b.bit_or(bits, to_bit(2, b.lt (x, b.splat(2.0f))));
        bits = b.bit_or(bits, to_bit(3, b.lte(x, b.splat(3.0f))));
        bits = b.bit_or(bits, to_bit(4, b.gt (x, b.splat(4.0f))));
        bits = b.bit_or(bits, to_bit(5, b.gte(x, b.splat(5.0f))));

        b.store32(b.varying<int>(), bits);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0..4 each have their own pattern; 5 and above all share one.
        const int want[] = {
            0b001111, 0b001100, 0b001010, 0b001010, 0b000010,
            0b110010, 0b110010, 0b110010, 0b110010, 0b110010,
        };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == want[i]);
        }
    });
}
449 
DEF_TEST(SkVM_index, r) {
    // Program: buf[i] = index().
    skvm::Builder b;
    b.store32(b.varying<int>(), b.index());

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[23];
        const int len = (int)SK_ARRAY_COUNT(buf);
        program.eval(SK_ARRAY_COUNT(buf), buf);

        // This test expects the stored value to count down from len.
        for (int i = 0; i < len; i++) {
            REPORTER_ASSERT(r, buf[i] == len-i);
        }
    });
}
462 
DEF_TEST(SkVM_mad, r) {
    // Exercises the awkward cases of instruction and register selection for
    // Op::mad_f32 by chaining mads whose operands overlap in various ways.

    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg));
        skvm::F32 y = b.mad(x,x,x);   // x is needed in the future, so r[x] != r[y].
        skvm::F32 z = b.mad(y,y,x);   // y is needed in the future, but r[z] = r[x] is ok.
        skvm::F32 w = b.mad(z,z,y);   // w can alias z but not y.
        skvm::F32 v = b.mad(w,y,w);   // Got to stop somewhere.
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int value = 2;
        program.eval(1, &value);
        // x = 2
        // y = 2*2 + 2 = 6
        // z = 6*6 + 2 = 38
        // w = 38*38 + 6 = 1450
        // v = 1450*6 + 1450 = 10150
        REPORTER_ASSERT(r, value == 10150);
    });
}
490 
DEF_TEST(SkVM_fms, r) {
    // Create a pattern that can be peepholed into an Op::fms_f32:
    //     buf[i] = trunc(float(buf[i]) * 2.0f - 1.0f)
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.mul(x, b.splat(2.0f)),
                            b.splat(1.0f));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        program.eval((int)SK_ARRAY_COUNT(buf), buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            // Must be a comparison: with `=` the check would overwrite buf[i]
            // with a non-zero value and could never fail.
            REPORTER_ASSERT(r, buf[i] == 2*i-1);
        }
    });
}
512 
DEF_TEST(SkVM_fnma, r) {
    // Create a pattern that can be peepholed into an Op::fnma_f32:
    //     buf[i] = trunc(1.0f - float(buf[i]) * 2.0f)
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.splat(1.0f),
                            b.mul(x, b.splat(2.0f)));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        program.eval((int)SK_ARRAY_COUNT(buf), buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            // Must be a comparison: with `=` the check would overwrite buf[i]
            // with a non-zero value and could never fail.
            REPORTER_ASSERT(r, buf[i] == 1-2*i);
        }
    });
}
534 
DEF_TEST(SkVM_madder, r) {
    // A second chain of mads with overlapping operands.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<float>();

        skvm::F32 x = b.loadF(arg);
        skvm::F32 y = b.mad(x,x,x);   // x is needed in the future, so r[x] != r[y].
        skvm::F32 z = b.mad(y,x,y);   // r[x] can be reused after this instruction, but not r[y].
        skvm::F32 w = b.mad(y,y,z);
        b.storeF(arg, w);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        // x = 2
        // y = 2*2 + 2 = 6
        // z = 6*2 + 6 = 18
        // w = 6*6 + 18 = 54
        float value = 2.0f;
        program.eval(1, &value);
        REPORTER_ASSERT(r, value == 54.0f);
    });
}
556 
DEF_TEST(SkVM_floor, r) {
    // Program: buf[i] = floor(buf[i]), in place.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<float>();
        skvm::F32 x = b.loadF(arg);
        b.storeF(arg, b.floor(x));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float       buf[]      = { -2.0f, -1.5f, -1.0f, 0.0f, 1.0f, 1.5f, 2.0f };
        const float expected[] = { -2.0f, -2.0f, -1.0f, 0.0f, 1.0f, 1.0f, 2.0f };
        const int count = (int)SK_ARRAY_COUNT(buf);

        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, buf[i] == expected[i]);
        }
    });
}
573 
DEF_TEST(SkVM_round, r) {
    // Program: dst[i] = round(src[i]).
    skvm::Builder b;
    {
        skvm::Ptr src = b.varying<float>();
        skvm::Ptr dst = b.varying<int>();
        skvm::F32 x = b.loadF(src);
        b.store32(dst, b.round(x));
    }

    // The inputs that sit exactly on a 0.5f boundary assume round-to-nearest-even
    // is the current rounding mode.  Nothing here enforces that; it is just
    // the likely default.
    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float     src[]      = { -1.5f, -0.5f, 0.0f, 0.5f, 0.2f, 0.6f, 1.0f, 1.4f, 1.5f, 2.0f };
        const int expected[] = { -2   ,  0   , 0   , 0   , 0   , 1   , 1   , 1   , 2   , 2    };
        int dst[SK_ARRAY_COUNT(src)];

        program.eval(SK_ARRAY_COUNT(src), src, dst);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(dst); i++) {
            REPORTER_ASSERT(r, dst[i] == expected[i]);
        }
    });
}
595 
DEF_TEST(SkVM_min, r) {
    // Program: dst[i] = min(src1[i], src2[i]).
    skvm::Builder b;
    {
        skvm::Ptr src1 = b.varying<float>();
        skvm::Ptr src2 = b.varying<float>();
        skvm::Ptr dst = b.varying<float>();

        b.storeF(dst, b.min(b.loadF(src1), b.loadF(src2)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float lhs[]            = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float rhs[]            = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        const float expected[] = { 0.0f, 1.0f, 3.0f, -1.0f, -2.0f};
        float out[SK_ARRAY_COUNT(lhs)];

        program.eval(SK_ARRAY_COUNT(out), lhs, rhs, out);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == expected[i]);
        }
    });
}
617 
DEF_TEST(SkVM_max, r) {
    // Program: dst[i] = max(src1[i], src2[i]).
    skvm::Builder b;
    {
        skvm::Ptr src1 = b.varying<float>();
        skvm::Ptr src2 = b.varying<float>();
        skvm::Ptr dst = b.varying<float>();

        b.storeF(dst, b.max(b.loadF(src1), b.loadF(src2)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float lhs[]            = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float rhs[]            = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        const float expected[] = { 0.0f, 2.0f, 4.0f,  1.0f, -1.0f};
        float out[SK_ARRAY_COUNT(lhs)];

        program.eval(SK_ARRAY_COUNT(out), lhs, rhs, out);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == expected[i]);
        }
    });
}
639 
DEF_TEST(SkVM_hoist, r) {
    // This program uses enough constants that it will fail to JIT if they are
    // all hoisted.  The JIT will then retry without hoisting, and that only
    // needs 2 registers.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();
        skvm::I32 sum = b.load32(arg);
        for (int k = 0; k < 32; k++) {
            sum = b.add(sum, b.splat(k));
        }
        b.store32(arg, sum);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int value = 4;
        program.eval(1, &value);
        // value += 0 + 1 + 2 + 3 + ... + 30 + 31
        // value += 496
        REPORTER_ASSERT(r, value == 500);
    });
}
661 
DEF_TEST(SkVM_select, r) {
    // Program: buf[i] = (buf[i] > 4) ? buf[i] : 42.
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();

        skvm::I32 x = b.load32(buf);

        x = b.select( b.gt(x, b.splat(4)), x, b.splat(42) );

        b.store32(buf, x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = { 0,1,2,3,4,5,6,7,8 };
        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            const int want = (i > 4 ? i : 42);
            REPORTER_ASSERT(r, buf[i] == want);
        }
    });
}
682 
DEF_TEST(SkVM_swap, r) {
    skvm::Builder b;
    {
        // The program below is equivalent to
        //     x = *X
        //     y = *Y
        //     *X = y
        //     *Y = x
        // Scheduling it purely by the data flow of Op arguments could yield
        //     x = *X
        //     *Y = x
        //     y = *Y
        //     *X = y
        // which gives different results, so that reordering is invalid.
        skvm::Ptr X = b.varying<int>(),
                  Y = b.varying<int>();

        skvm::I32 x = b.load32(X),
                  y = b.load32(Y);

        b.store32(X, y);
        b.store32(Y, x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int first [] = { 0,1,2,3 };
        int second[] = { 4,5,6,7 };
        program.eval(SK_ARRAY_COUNT(first), first, second);

        // After the swap each buffer holds the other's original contents.
        for (int i = 0; i < (int)SK_ARRAY_COUNT(first); i++) {
            REPORTER_ASSERT(r, first[i] == 4 + i);
            REPORTER_ASSERT(r, second[i] == i);
        }
    });
}
717 
DEF_TEST(SkVM_NewOps, r) {
    // Exercise a somewhat arbitrary mix of ops: 16-bit load/store, uniform32,
    // integer math, select-based clamping and an 8-bit gather.
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int16_t>();
        skvm::UPtr uniforms = b.uniform();

        skvm::I32 x = b.load16(buf);

        // The int uniforms are read at offsets starting one pointer-size in.
        const size_t kPtr = sizeof(const int*);

        x = b.add(x, b.uniform32(uniforms, kPtr+0));
        x = b.mul(x, b.uniform32(uniforms, kPtr+4));
        x = b.sub(x, b.uniform32(uniforms, kPtr+8));

        // Clamp x to [0, limit].
        skvm::I32 limit = b.uniform32(uniforms, kPtr+12);
        x = b.select(b.lt(x, b.splat(0)), b.splat(0), x);
        x = b.select(b.gt(x, limit     ), limit     , x);

        x = b.gather8(uniforms,0, x);

        b.store16(buf, x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        const int N = 31;
        int16_t buf[N];
        for (int i = 0; i < N; i++) {
            buf[i] = i;
        }

        const int M = 16;
        uint8_t img[M];
        for (int k = 0; k < M; k++) {
            img[k] = k*k;
        }

        struct {
            const uint8_t* img;
            int      add   = 5;
            int      mul   = 3;
            int      sub   = 18;
            int      limit = M-1;
        } uniforms{img};

        program.eval(N, buf, &uniforms);

        for (int i = 0; i < N; i++) {
            // The arithmetic computes (i+5)*3 - 18, i.e. 3*(i-1) ...
            int index = 3*(i-1);

            // ... which is then pinned to the bounds of img.  i == 1 lands on
            // 0 exactly and i == 6 lands on 15 exactly.
            index = (index <  0) ?  0 : index;
            index = (index > 15) ? 15 : index;
            REPORTER_ASSERT(r, buf[i] == img[index]);
        }
    });
}
776 
DEF_TEST(SKVM_array32, r) {
    // Reads int and float arrays pushed through skvm::Uniforms, and checks
    // that changes made to the arrays between evals are picked up.
    skvm::Builder b;
    skvm::Uniforms uniforms(b.uniform(), 0);
    // Occupy the first slot so the arrays below are not at offset 0.
    uniforms.push(0);
    int ints[] = {3, 7};
    skvm::Uniform intArray = uniforms.pushArray(ints);
    float floats[] = {5, 9};
    skvm::Uniform floatArray = uniforms.pushArrayF(floats);
    {
        skvm::Ptr buf0     = b.varying<int32_t>(),
                  buf1     = b.varying<int32_t>(),
                  buf2     = b.varying<int32_t>();

        skvm::I32 first = b.array32(intArray, 0);
        b.store32(buf0, first);
        skvm::I32 second = b.array32(intArray, 1);
        b.store32(buf1, second);

        skvm::F32 x = b.arrayF(floatArray, 0);
        skvm::F32 y = b.arrayF(floatArray, 1);
        b.store32(buf2, b.trunc(b.add(x, y)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        const int K = 10;
        int32_t buf0[K],
                buf1[K],
                buf2[K];

        // Sets ints[0] and floats[1], evaluates, and checks all three outputs.
        auto run_and_check = [&](int int0, float float1, int want0, int want2) {
            ints[0]   = int0;
            floats[1] = float1;
            program.eval(K, uniforms.buf.data(), buf0, buf1, buf2);
            for (auto v : buf0) {
                REPORTER_ASSERT(r, v == want0);
            }
            for (auto v : buf1) {
                REPORTER_ASSERT(r, v == 7);
            }
            for (auto v : buf2) {
                REPORTER_ASSERT(r, v == want2);
            }
        };

        // This lambda may run twice, so start from the initial values each time.
        run_and_check(3,  9, 3, 14);
        run_and_check(4, 10, 4, 15);
    });
}
837 
DEF_TEST(SkVM_sqrt, r) {
    // Program: buf[i] = sqrt(buf[i]), in place.
    skvm::Builder b;
    // Declared as float to match the loadF/storeF accesses and the float
    // buffer passed to eval(), as the other float tests in this file do.
    skvm::Ptr arg = b.varying<float>();
    b.storeF(arg, b.sqrt(b.loadF(arg)));

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        constexpr int K = 17;
        float buf[K];
        for (int i = 0; i < K; i++) {
            buf[i] = (float)(i*i);
        }

        // x^2 -> x
        program.eval(K, buf);

        for (int i = 0; i < K; i++) {
            REPORTER_ASSERT(r, buf[i] == (float)i);
        }
    });
}
858 
DEF_TEST(SkVM_MSAN, r) {
    // This small memset32()-style program should be able to JIT.  In an MSAN
    // build, running that JIT code would not be seen as initializing buf, so
    // this checks that the interpreter is used instead.
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        constexpr int kCount = 17;
        int buf[kCount];            // Intentionally uninitialized.
        program.eval(kCount, buf);
        sk_msan_assert_initialized(buf, buf+kCount);
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, buf[i] == 42);
        }
    });
}
876 
DEF_TEST(SkVM_assert, r) {
    // Program: assert_true(x < 42) for each loaded x.  Every input below
    // satisfies the condition.
    skvm::Builder b;
    skvm::I32 x = b.load32(b.varying<int>());
    b.assert_true(b.lt(x, b.splat(42)));

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int values[] = { 0,1,2,3,4,5,6,7,8,9 };
        program.eval(SK_ARRAY_COUNT(values), values);
    });
}
887 
DEF_TEST(SkVM_trace_line, r) {
    // Records the line numbers passed to line().  Any other hook writes a
    // marker value that would make the final comparison fail.
    class LineRecorder : public skvm::TraceHook {
    public:
        void line(int lineNum) override { fBuffer.push_back(lineNum); }

        void var(int, int32_t) override { fBuffer.push_back(-9999999); }
        void enter(int) override        { fBuffer.push_back(-9999999); }
        void exit(int) override         { fBuffer.push_back(-9999999); }
        void scope(int) override        { fBuffer.push_back(-9999999); }

        std::vector<int> fBuffer;
    };

    skvm::Builder b;
    LineRecorder recorder;
    int hookID = b.attachTraceHook(&recorder);

    // Only the calls whose two mask arguments are both 0xFFFFFFFF are expected
    // to reach the hook.
    b.trace_line(hookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 123);
    b.trace_line(hookID, b.splat(0x00000000), b.splat(0xFFFFFFFF), 456);
    b.trace_line(hookID, b.splat(0xFFFFFFFF), b.splat(0x00000000), 567);
    b.trace_line(hookID, b.splat(0x00000000), b.splat(0x00000000), 678);
    b.trace_line(hookID, b.splat(0xFFFFFFFF), b.splat(0xFFFFFFFF), 789);

    skvm::Program p = b.done();
    p.eval(1);

    REPORTER_ASSERT(r, (recorder.fBuffer == std::vector<int>{123, 789}));
}
913 
DEF_TEST(SkVM_trace_var, r) {
    // Hook that logs (slot, value) for every var() callback.  All other callbacks
    // log a sentinel value that cannot appear in the expected output.
    class TestTraceHook : public skvm::TraceHook {
    public:
        std::vector<int> fBuffer;

        void var(int slot, int32_t val) override {
            fBuffer.push_back(slot);
            fBuffer.push_back(val);
        }

        void line(int) override  { fBuffer.push_back(-9999999); }
        void enter(int) override { fBuffer.push_back(-9999999); }
        void exit(int) override  { fBuffer.push_back(-9999999); }
        void scope(int) override { fBuffer.push_back(-9999999); }
    };

    skvm::Builder builder;
    TestTraceHook testTrace;
    const int hookID = builder.attachTraceHook(&testTrace);

    // Only the calls with both mask operands all-ones (slots 4 and 6) are expected
    // to reach the hook.
    const skvm::I32 off = builder.splat(0x00000000);
    const skvm::I32 on  = builder.splat(0xFFFFFFFF);
    builder.trace_var(hookID, off, on,  2, builder.splat(333));
    builder.trace_var(hookID, on,  on,  4, builder.splat(555));
    builder.trace_var(hookID, off, off, 5, builder.splat(666));
    builder.trace_var(hookID, on,  on,  6, builder.splat(777));
    builder.trace_var(hookID, on,  off, 8, builder.splat(999));

    skvm::Program program = builder.done();
    program.eval(1);

    REPORTER_ASSERT(r, (testTrace.fBuffer == std::vector<int>{4, 555, 6, 777}));
}
942 
DEF_TEST(SkVM_trace_enter_exit, r) {
    // Hook that logs {fnIdx, 1} on enter() and {fnIdx, 0} on exit().  All other
    // callbacks log a sentinel value that cannot appear in the expected output.
    class TestTraceHook : public skvm::TraceHook {
    public:
        std::vector<int> fBuffer;

        void enter(int fnIdx) override { fBuffer.insert(fBuffer.end(), {fnIdx, 1}); }
        void exit(int fnIdx) override  { fBuffer.insert(fBuffer.end(), {fnIdx, 0}); }

        void line(int) override         { fBuffer.push_back(-9999999); }
        void var(int, int32_t) override { fBuffer.push_back(-9999999); }
        void scope(int) override        { fBuffer.push_back(-9999999); }
    };

    skvm::Builder builder;
    TestTraceHook testTrace;
    const int hookID = builder.attachTraceHook(&testTrace);

    // Only the calls with both mask operands all-ones (enter 12, exit 56) are
    // expected to reach the hook.
    const skvm::I32 off = builder.splat(0x00000000);
    const skvm::I32 on  = builder.splat(0xFFFFFFFF);
    builder.trace_enter(hookID, off, off, 99);
    builder.trace_enter(hookID, on,  on,  12);
    builder.trace_enter(hookID, off, on,  34);
    builder.trace_exit(hookID, on,  on,  56);
    builder.trace_exit(hookID, on,  off, 78);
    builder.trace_exit(hookID, off, off, 90);

    skvm::Program program = builder.done();
    program.eval(1);

    REPORTER_ASSERT(r, (testTrace.fBuffer == std::vector<int>{12, 1, 56, 0}));
}
975 
DEF_TEST(SkVM_trace_scope, r) {
    // Hook that logs the delta passed to every scope() callback.  All other
    // callbacks log a sentinel value that cannot appear in the expected output.
    class TestTraceHook : public skvm::TraceHook {
    public:
        std::vector<int> fBuffer;

        void scope(int delta) override { fBuffer.push_back(delta); }

        void var(int, int32_t) override { fBuffer.push_back(-9999999); }
        void enter(int) override        { fBuffer.push_back(-9999999); }
        void exit(int) override         { fBuffer.push_back(-9999999); }
        void line(int) override         { fBuffer.push_back(-9999999); }
    };

    skvm::Builder builder;
    TestTraceHook testTrace;
    const int hookID = builder.attachTraceHook(&testTrace);

    // Only the calls with both mask operands all-ones (deltas 1 and -5) are
    // expected to reach the hook.
    const skvm::I32 on  = builder.splat(0xFFFFFFFF);
    const skvm::I32 off = builder.splat(0x00000000);
    builder.trace_scope(hookID, on,  on,  1);
    builder.trace_scope(hookID, on,  off, -2);
    builder.trace_scope(hookID, off, off, 3);
    builder.trace_scope(hookID, off, on,  4);
    builder.trace_scope(hookID, on,  on,  -5);

    skvm::Program program = builder.done();
    program.eval(1);

    REPORTER_ASSERT(r, (testTrace.fBuffer == std::vector<int>{1, -5}));
}
1001 
DEF_TEST(SkVM_trace_multiple_hooks, r) {
    // Hook that logs the argument of every line() callback.  All other callbacks
    // log a sentinel value that cannot appear in the expected output.
    class TestTraceHook : public skvm::TraceHook {
    public:
        std::vector<int> fBuffer;

        void line(int lineNum) override { fBuffer.push_back(lineNum); }

        void var(int, int32_t) override { fBuffer.push_back(-9999999); }
        void enter(int) override        { fBuffer.push_back(-9999999); }
        void exit(int) override         { fBuffer.push_back(-9999999); }
        void scope(int) override        { fBuffer.push_back(-9999999); }
    };

    // Three independent hooks attached to one builder, in the order A, B, C.
    skvm::Builder builder;
    TestTraceHook testTraceA;
    TestTraceHook testTraceB;
    TestTraceHook testTraceC;
    const int idA = builder.attachTraceHook(&testTraceA);
    const int idB = builder.attachTraceHook(&testTraceB);
    const int idC = builder.attachTraceHook(&testTraceC);

    // Each hook is targeted twice, once with both mask operands all-ones and once
    // without; each hook is expected to see only its own all-ones call.
    const skvm::I32 on  = builder.splat(0xFFFFFFFF);
    const skvm::I32 off = builder.splat(0x00000000);
    builder.trace_line(idC, on,  on,  111);
    builder.trace_line(idA, on,  on,  222);
    builder.trace_line(idC, off, off, 333);
    builder.trace_line(idB, on,  off, 444);
    builder.trace_line(idA, off, on,  555);
    builder.trace_line(idB, on,  on,  666);

    skvm::Program program = builder.done();
    program.eval(1);

    REPORTER_ASSERT(r, (testTraceA.fBuffer == std::vector<int>{222}));
    REPORTER_ASSERT(r, (testTraceB.fBuffer == std::vector<int>{666}));
    REPORTER_ASSERT(r, (testTraceC.fBuffer == std::vector<int>{111}));
}
1032 
DEF_TEST(SkVM_premul,reporter)1033 DEF_TEST(SkVM_premul, reporter) {
1034     // Test that premul is short-circuited when alpha is known opaque.
1035     {
1036         skvm::Builder p;
1037         auto rptr = p.varying<int>(),
1038              aptr = p.varying<int>();
1039 
1040         skvm::F32 r = p.loadF(rptr),
1041                   g = p.splat(0.0f),
1042                   b = p.splat(0.0f),
1043                   a = p.loadF(aptr);
1044 
1045         p.premul(&r, &g, &b, a);
1046         p.storeF(rptr, r);
1047 
1048         // load red, load alpha, red *= alpha, store red
1049         REPORTER_ASSERT(reporter, p.done().instructions().size() == 4);
1050     }
1051 
1052     {
1053         skvm::Builder p;
1054         auto rptr = p.varying<int>();
1055 
1056         skvm::F32 r = p.loadF(rptr),
1057                   g = p.splat(0.0f),
1058                   b = p.splat(0.0f),
1059                   a = p.splat(1.0f);
1060 
1061         p.premul(&r, &g, &b, a);
1062         p.storeF(rptr, r);
1063 
1064         // load red, store red
1065         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
1066     }
1067 
1068     // Same deal for unpremul.
1069     {
1070         skvm::Builder p;
1071         auto rptr = p.varying<int>(),
1072              aptr = p.varying<int>();
1073 
1074         skvm::F32 r = p.loadF(rptr),
1075                   g = p.splat(0.0f),
1076                   b = p.splat(0.0f),
1077                   a = p.loadF(aptr);
1078 
1079         p.unpremul(&r, &g, &b, a);
1080         p.storeF(rptr, r);
1081 
1082         // load red, load alpha, a bunch of unpremul instructions, store red
1083         REPORTER_ASSERT(reporter, p.done().instructions().size() >= 4);
1084     }
1085 
1086     {
1087         skvm::Builder p;
1088         auto rptr = p.varying<int>();
1089 
1090         skvm::F32 r = p.loadF(rptr),
1091                   g = p.splat(0.0f),
1092                   b = p.splat(0.0f),
1093                   a = p.splat(1.0f);
1094 
1095         p.unpremul(&r, &g, &b, a);
1096         p.storeF(rptr, r);
1097 
1098         // load red, store red
1099         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
1100     }
1101 }
1102 
1103 template <typename Fn>
test_asm(skiatest::Reporter * r,Fn && fn,std::initializer_list<uint8_t> expected)1104 static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
1105     uint8_t buf[4096];
1106     skvm::Assembler a{buf};
1107     fn(a);
1108 
1109     REPORTER_ASSERT(r, a.size() == expected.size());
1110 
1111     auto got = (const uint8_t*)buf,
1112          want = expected.begin();
1113     for (int i = 0; i < (int)std::min(a.size(), expected.size()); i++) {
1114         REPORTER_ASSERT(r, got[i] == want[i],
1115                         "byte %d was %02x, want %02x", i, got[i], want[i]);
1116     }
1117 }
1118 
DEF_TEST(SkVM_Assembler,r)1119 DEF_TEST(SkVM_Assembler, r) {
1120     // Easiest way to generate test cases is
1121     //
1122     //   echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
1123     //
1124     // The -x86-asm-syntax=intel bit is optional, controlling the
1125     // input syntax only; the output will always be AT&T  op x,y,dst style.
1126     // Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
1127     // that a bit easier to use here, despite maybe favoring AT&T overall.
1128 
1129     using A = skvm::Assembler;
1130     // Our exit strategy from AVX code.
1131     test_asm(r, [&](A& a) {
1132         a.int3();
1133         a.vzeroupper();
1134         a.ret();
1135     },{
1136         0xcc,
1137         0xc5, 0xf8, 0x77,
1138         0xc3,
1139     });
1140 
1141     // Align should pad with zero
1142     test_asm(r, [&](A& a) {
1143         a.ret();
1144         a.align(4);
1145     },{
1146         0xc3,
1147         0x00, 0x00, 0x00,
1148     });
1149 
1150     test_asm(r, [&](A& a) {
1151         a.add(A::rax, 8);       // Always good to test rax.
1152         a.sub(A::rax, 32);
1153 
1154         a.add(A::rdi, 12);      // Last 0x48 REX
1155         a.sub(A::rdi, 8);
1156 
1157         a.add(A::r8 , 7);       // First 0x49 REX
1158         a.sub(A::r8 , 4);
1159 
1160         a.add(A::rsi, 128);     // Requires 4 byte immediate.
1161         a.sub(A::r8 , 1000000);
1162 
1163         a.add(A::Mem{A::rsi}, 7);                       // addq $7, (%rsi)
1164         a.add(A::Mem{A::rsi, 12}, 7);                   // addq $7, 12(%rsi)
1165         a.add(A::Mem{A::rsp, 12}, 7);                   // addq $7, 12(%rsp)
1166         a.add(A::Mem{A::r12, 12}, 7);                   // addq $7, 12(%r12)
1167         a.add(A::Mem{A::rsp, 12, A::rax, A::FOUR}, 7);  // addq $7, 12(%rsp,%rax,4)
1168         a.add(A::Mem{A::r12, 12, A::rax, A::FOUR}, 7);  // addq $7, 12(%r12,%rax,4)
1169         a.add(A::Mem{A::rax, 12, A::r12, A::FOUR}, 7);  // addq $7, 12(%rax,%r12,4)
1170         a.add(A::Mem{A::r11, 12, A::r8 , A::TWO }, 7);  // addq $7, 12(%r11,%r8,2)
1171         a.add(A::Mem{A::r11, 12, A::rax}         , 7);  // addq $7, 12(%r11,%rax)
1172         a.add(A::Mem{A::rax, 12, A::r11}         , 7);  // addq $7, 12(%rax,%r11)
1173 
1174         a.sub(A::Mem{A::rax, 12, A::r11}         , 7);  // subq $7, 12(%rax,%r11)
1175 
1176         a.add(       A::rax     , A::rcx);              // addq %rcx, %rax
1177         a.add(A::Mem{A::rax}    , A::rcx);              // addq %rcx, (%rax)
1178         a.add(A::Mem{A::rax, 12}, A::rcx);              // addq %rcx, 12(%rax)
1179         a.add(A::rcx, A::Mem{A::rax, 12});              // addq 12(%rax), %rcx
1180 
1181         a.sub(A::rcx, A::Mem{A::rax, 12});              // subq 12(%rax), %rcx
1182     },{
1183         0x48, 0x83, 0b11'000'000, 0x08,
1184         0x48, 0x83, 0b11'101'000, 0x20,
1185 
1186         0x48, 0x83, 0b11'000'111, 0x0c,
1187         0x48, 0x83, 0b11'101'111, 0x08,
1188 
1189         0x49, 0x83, 0b11'000'000, 0x07,
1190         0x49, 0x83, 0b11'101'000, 0x04,
1191 
1192         0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
1193         0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
1194 
1195         0x48,0x83,0x06,0x07,
1196         0x48,0x83,0x46,0x0c,0x07,
1197         0x48,0x83,0x44,0x24,0x0c,0x07,
1198         0x49,0x83,0x44,0x24,0x0c,0x07,
1199         0x48,0x83,0x44,0x84,0x0c,0x07,
1200         0x49,0x83,0x44,0x84,0x0c,0x07,
1201         0x4a,0x83,0x44,0xa0,0x0c,0x07,
1202         0x4b,0x83,0x44,0x43,0x0c,0x07,
1203         0x49,0x83,0x44,0x03,0x0c,0x07,
1204         0x4a,0x83,0x44,0x18,0x0c,0x07,
1205 
1206         0x4a,0x83,0x6c,0x18,0x0c,0x07,
1207 
1208         0x48,0x01,0xc8,
1209         0x48,0x01,0x08,
1210         0x48,0x01,0x48,0x0c,
1211         0x48,0x03,0x48,0x0c,
1212         0x48,0x2b,0x48,0x0c,
1213     });
1214 
1215 
1216     test_asm(r, [&](A& a) {
1217         a.vpaddd (A::ymm0, A::ymm1, A::ymm2);  // Low registers and 0x0f map     -> 2-byte VEX.
1218         a.vpaddd (A::ymm8, A::ymm1, A::ymm2);  // A high dst register is ok      -> 2-byte VEX.
1219         a.vpaddd (A::ymm0, A::ymm8, A::ymm2);  // A high first argument register -> 2-byte VEX.
1220         a.vpaddd (A::ymm0, A::ymm1, A::ymm8);  // A high second argument         -> 3-byte VEX.
1221         a.vpmulld(A::ymm0, A::ymm1, A::ymm2);  // Using non-0x0f map instruction -> 3-byte VEX.
1222         a.vpsubd (A::ymm0, A::ymm1, A::ymm2);  // Test vpsubd to ensure argument order is right.
1223     },{
1224         /*    VEX     */ /*op*/ /*modRM*/
1225         0xc5,       0xf5, 0xfe, 0xc2,
1226         0xc5,       0x75, 0xfe, 0xc2,
1227         0xc5,       0xbd, 0xfe, 0xc2,
1228         0xc4, 0xc1, 0x75, 0xfe, 0xc0,
1229         0xc4, 0xe2, 0x75, 0x40, 0xc2,
1230         0xc5,       0xf5, 0xfa, 0xc2,
1231     });
1232 
1233     test_asm(r, [&](A& a) {
1234         a.vpaddw   (A::ymm4, A::ymm3, A::ymm2);
1235         a.vpavgw   (A::ymm4, A::ymm3, A::ymm2);
1236         a.vpcmpeqw (A::ymm4, A::ymm3, A::ymm2);
1237         a.vpcmpgtw (A::ymm4, A::ymm3, A::ymm2);
1238 
1239         a.vpminsw  (A::ymm4, A::ymm3, A::ymm2);
1240         a.vpmaxsw  (A::ymm4, A::ymm3, A::ymm2);
1241         a.vpminuw  (A::ymm4, A::ymm3, A::ymm2);
1242         a.vpmaxuw  (A::ymm4, A::ymm3, A::ymm2);
1243 
1244         a.vpmulhrsw(A::ymm4, A::ymm3, A::ymm2);
1245         a.vpabsw   (A::ymm4, A::ymm3);
1246         a.vpsllw   (A::ymm4, A::ymm3, 12);
1247         a.vpsraw   (A::ymm4, A::ymm3, 12);
1248     },{
1249         0xc5,     0xe5, 0xfd, 0xe2,
1250         0xc5,     0xe5, 0xe3, 0xe2,
1251         0xc5,     0xe5, 0x75, 0xe2,
1252         0xc5,     0xe5, 0x65, 0xe2,
1253 
1254         0xc5,     0xe5, 0xea, 0xe2,
1255         0xc5,     0xe5, 0xee, 0xe2,
1256         0xc4,0xe2,0x65, 0x3a, 0xe2,
1257         0xc4,0xe2,0x65, 0x3e, 0xe2,
1258 
1259         0xc4,0xe2,0x65, 0x0b, 0xe2,
1260         0xc4,0xe2,0x7d, 0x1d, 0xe3,
1261         0xc5,0xdd,0x71, 0xf3, 0x0c,
1262         0xc5,0xdd,0x71, 0xe3, 0x0c,
1263     });
1264 
1265     test_asm(r, [&](A& a) {
1266         A::Label l;
1267         a.vcmpeqps (A::ymm0, A::ymm1, &l);      // vcmpeqps 0x1c(%rip), %ymm1, %ymm0
1268         a.vpcmpeqd (A::ymm0, A::ymm1, A::ymm2);
1269         a.vpcmpgtd (A::ymm0, A::ymm1, A::ymm2);
1270         a.vcmpeqps (A::ymm0, A::ymm1, A::ymm2);
1271         a.vcmpltps (A::ymm0, A::ymm1, A::ymm2);
1272         a.vcmpleps (A::ymm0, A::ymm1, A::ymm2);
1273         a.vcmpneqps(A::ymm0, A::ymm1, A::ymm2);
1274         a.label(&l);   // 28 bytes after the vcmpeqps that uses it.
1275     },{
1276         0xc5,0xf4,0xc2,0x05,0x1c,0x00,0x00,0x00,0x00,
1277         0xc5,0xf5,0x76,0xc2,
1278         0xc5,0xf5,0x66,0xc2,
1279         0xc5,0xf4,0xc2,0xc2,0x00,
1280         0xc5,0xf4,0xc2,0xc2,0x01,
1281         0xc5,0xf4,0xc2,0xc2,0x02,
1282         0xc5,0xf4,0xc2,0xc2,0x04,
1283     });
1284 
1285     test_asm(r, [&](A& a) {
1286         a.vminps(A::ymm0, A::ymm1, A::ymm2);
1287         a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
1288     },{
1289         0xc5,0xf4,0x5d,0xc2,
1290         0xc5,0xf4,0x5f,0xc2,
1291     });
1292 
1293     test_asm(r, [&](A& a) {
1294         a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
1295     },{
1296         0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
1297     });
1298 
1299     test_asm(r, [&](A& a) {
1300         a.vpsrld(A::ymm15, A::ymm2, 8);
1301         a.vpsrld(A::ymm0 , A::ymm8, 5);
1302     },{
1303         0xc5,     0x85, 0x72,0xd2, 0x08,
1304         0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
1305     });
1306 
1307     test_asm(r, [&](A& a) {
1308         A::Label l;
1309         a.vpermps(A::ymm1, A::ymm2, A::Mem{A::rdi, 32});
1310         a.vperm2f128(A::ymm1, A::ymm2, &l, 0x20);
1311         a.vpermq(A::ymm1, A::ymm2, 5);
1312         a.label(&l);  // 6 bytes after vperm2f128
1313     },{
1314         0xc4,0xe2,0x6d,0x16,0x4f,0x20,
1315         0xc4,0xe3,0x6d,0x06,0x0d,0x06,0x00,0x00,0x00,0x20,
1316         0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
1317     });
1318 
1319     test_asm(r, [&](A& a) {
1320         a.vpunpckldq(A::ymm1, A::ymm2, A::Mem{A::rdi});
1321         a.vpunpckhdq(A::ymm1, A::ymm2, A::ymm3);
1322     },{
1323         0xc5,0xed,0x62,0x0f,
1324         0xc5,0xed,0x6a,0xcb,
1325     });
1326 
1327     test_asm(r, [&](A& a) {
1328         a.vroundps(A::ymm1, A::ymm2, A::NEAREST);
1329         a.vroundps(A::ymm1, A::ymm2, A::FLOOR);
1330         a.vroundps(A::ymm1, A::ymm2, A::CEIL);
1331         a.vroundps(A::ymm1, A::ymm2, A::TRUNC);
1332     },{
1333         0xc4,0xe3,0x7d,0x08,0xca,0x00,
1334         0xc4,0xe3,0x7d,0x08,0xca,0x01,
1335         0xc4,0xe3,0x7d,0x08,0xca,0x02,
1336         0xc4,0xe3,0x7d,0x08,0xca,0x03,
1337     });
1338 
1339     test_asm(r, [&](A& a) {
1340         A::Label l;
1341         a.label(&l);
1342         a.byte(1);
1343         a.byte(2);
1344         a.byte(3);
1345         a.byte(4);
1346 
1347         a.vbroadcastss(A::ymm0 , &l);
1348         a.vbroadcastss(A::ymm1 , &l);
1349         a.vbroadcastss(A::ymm8 , &l);
1350         a.vbroadcastss(A::ymm15, &l);
1351 
1352         a.vpshufb(A::ymm4, A::ymm3, &l);
1353         a.vpaddd (A::ymm4, A::ymm3, &l);
1354         a.vpsubd (A::ymm4, A::ymm3, &l);
1355 
1356         a.vptest(A::ymm4, &l);
1357 
1358         a.vmulps (A::ymm4, A::ymm3, &l);
1359     },{
1360         0x01, 0x02, 0x03, 0x4,
1361 
1362         /*     VEX    */  /*op*/ /*   ModRM    */  /*     offset     */
1363         0xc4, 0xe2, 0x7d,  0x18,   0b00'000'101,   0xf3,0xff,0xff,0xff,   // 0xfffffff3 == -13
1364         0xc4, 0xe2, 0x7d,  0x18,   0b00'001'101,   0xea,0xff,0xff,0xff,   // 0xffffffea == -22
1365         0xc4, 0x62, 0x7d,  0x18,   0b00'000'101,   0xe1,0xff,0xff,0xff,   // 0xffffffe1 == -31
1366         0xc4, 0x62, 0x7d,  0x18,   0b00'111'101,   0xd8,0xff,0xff,0xff,   // 0xffffffd8 == -40
1367 
1368         0xc4, 0xe2, 0x65,  0x00,   0b00'100'101,   0xcf,0xff,0xff,0xff,   // 0xffffffcf == -49
1369 
1370         0xc5, 0xe5,        0xfe,   0b00'100'101,   0xc7,0xff,0xff,0xff,   // 0xffffffc7 == -57
1371         0xc5, 0xe5,        0xfa,   0b00'100'101,   0xbf,0xff,0xff,0xff,   // 0xffffffbf == -65
1372 
1373         0xc4, 0xe2, 0x7d,  0x17,   0b00'100'101,   0xb6,0xff,0xff,0xff,   // 0xffffffb6 == -74
1374 
1375         0xc5, 0xe4,        0x59,   0b00'100'101,   0xae,0xff,0xff,0xff,   // 0xffffffae == -82
1376     });
1377 
1378     test_asm(r, [&](A& a) {
1379         a.vbroadcastss(A::ymm0,  A::Mem{A::rdi,   0});
1380         a.vbroadcastss(A::ymm13, A::Mem{A::r14,   7});
1381         a.vbroadcastss(A::ymm8,  A::Mem{A::rdx, -12});
1382         a.vbroadcastss(A::ymm8,  A::Mem{A::rdx, 400});
1383 
1384         a.vbroadcastss(A::ymm8,  A::xmm0);
1385         a.vbroadcastss(A::ymm0,  A::xmm13);
1386     },{
1387         /*   VEX    */ /*op*/     /*ModRM*/   /*offset*/
1388         0xc4,0xe2,0x7d, 0x18,   0b00'000'111,
1389         0xc4,0x42,0x7d, 0x18,   0b01'101'110,  0x07,
1390         0xc4,0x62,0x7d, 0x18,   0b01'000'010,  0xf4,
1391         0xc4,0x62,0x7d, 0x18,   0b10'000'010,  0x90,0x01,0x00,0x00,
1392 
1393         0xc4,0x62,0x7d, 0x18,   0b11'000'000,
1394         0xc4,0xc2,0x7d, 0x18,   0b11'000'101,
1395     });
1396 
1397     test_asm(r, [&](A& a) {
1398         A::Label l;
1399         a.label(&l);
1400         a.jne(&l);
1401         a.jne(&l);
1402         a.je (&l);
1403         a.jmp(&l);
1404         a.jl (&l);
1405         a.jc (&l);
1406 
1407         a.cmp(A::rdx, 1);
1408         a.cmp(A::rax, 12);
1409         a.cmp(A::r14, 2000000000);
1410     },{
1411         0x0f,0x85, 0xfa,0xff,0xff,0xff,   // near jne -6 bytes
1412         0x0f,0x85, 0xf4,0xff,0xff,0xff,   // near jne -12 bytes
1413         0x0f,0x84, 0xee,0xff,0xff,0xff,   // near je  -18 bytes
1414         0xe9,      0xe9,0xff,0xff,0xff,   // near jmp -23 bytes
1415         0x0f,0x8c, 0xe3,0xff,0xff,0xff,   // near jl  -29 bytes
1416         0x0f,0x82, 0xdd,0xff,0xff,0xff,   // near jc  -35 bytes
1417 
1418         0x48,0x83,0xfa,0x01,
1419         0x48,0x83,0xf8,0x0c,
1420         0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
1421     });
1422 
1423     test_asm(r, [&](A& a) {
1424         a.vmovups(A::ymm5, A::Mem{A::rsi});
1425         a.vmovups(A::Mem{A::rsi}, A::ymm5);
1426 
1427         a.vmovups(A::xmm5, A::Mem{A::rsi});
1428         a.vmovups(A::Mem{A::rsi}, A::xmm5);
1429 
1430         a.vpmovzxwd(A::ymm4, A::Mem{A::rsi});
1431         a.vpmovzxbd(A::ymm4, A::Mem{A::rsi});
1432 
1433         a.vmovq(A::Mem{A::rdx}, A::xmm15);
1434     },{
1435         /*    VEX    */  /*Op*/  /*  ModRM  */
1436         0xc5,     0xfc,   0x10,  0b00'101'110,
1437         0xc5,     0xfc,   0x11,  0b00'101'110,
1438 
1439         0xc5,     0xf8,   0x10,  0b00'101'110,
1440         0xc5,     0xf8,   0x11,  0b00'101'110,
1441 
1442         0xc4,0xe2,0x7d,   0x33,  0b00'100'110,
1443         0xc4,0xe2,0x7d,   0x31,  0b00'100'110,
1444 
1445         0xc5,     0x79,   0xd6,  0b00'111'010,
1446     });
1447 
1448     test_asm(r, [&](A& a) {
1449         a.vmovups(A::ymm5, A::Mem{A::rsp,  0});
1450         a.vmovups(A::ymm5, A::Mem{A::rsp, 64});
1451         a.vmovups(A::ymm5, A::Mem{A::rsp,128});
1452 
1453         a.vmovups(A::Mem{A::rsp,  0}, A::ymm5);
1454         a.vmovups(A::Mem{A::rsp, 64}, A::ymm5);
1455         a.vmovups(A::Mem{A::rsp,128}, A::ymm5);
1456     },{
1457         0xc5,0xfc,0x10,0x2c,0x24,
1458         0xc5,0xfc,0x10,0x6c,0x24,0x40,
1459         0xc5,0xfc,0x10,0xac,0x24,0x80,0x00,0x00,0x00,
1460 
1461         0xc5,0xfc,0x11,0x2c,0x24,
1462         0xc5,0xfc,0x11,0x6c,0x24,0x40,
1463         0xc5,0xfc,0x11,0xac,0x24,0x80,0x00,0x00,0x00,
1464     });
1465 
1466     test_asm(r, [&](A& a) {
1467         a.movzbq(A::rax, A::Mem{A::rsi});   // Low registers for src and dst.
1468         a.movzbq(A::rax, A::Mem{A::r8,});   // High src register.
1469         a.movzbq(A::r8 , A::Mem{A::rsi});   // High dst register.
1470         a.movzbq(A::r8,  A::Mem{A::rsi, 12});
1471         a.movzbq(A::r8,  A::Mem{A::rsi, 400});
1472 
1473         a.movzwq(A::rax, A::Mem{A::rsi});   // Low registers for src and dst.
1474         a.movzwq(A::rax, A::Mem{A::r8,});   // High src register.
1475         a.movzwq(A::r8 , A::Mem{A::rsi});   // High dst register.
1476         a.movzwq(A::r8,  A::Mem{A::rsi, 12});
1477         a.movzwq(A::r8,  A::Mem{A::rsi, 400});
1478 
1479         a.vmovd(A::Mem{A::rax}, A::xmm0);
1480         a.vmovd(A::Mem{A::rax}, A::xmm8);
1481         a.vmovd(A::Mem{A::r8 }, A::xmm0);
1482 
1483         a.vmovd(A::xmm0, A::Mem{A::rax});
1484         a.vmovd(A::xmm8, A::Mem{A::rax});
1485         a.vmovd(A::xmm0, A::Mem{A::r8 });
1486 
1487         a.vmovd(A::xmm0 , A::Mem{A::rax, 0, A::rcx, A::FOUR});
1488         a.vmovd(A::xmm15, A::Mem{A::rax, 0, A::r8,  A::TWO });
1489         a.vmovd(A::xmm0 , A::Mem{A::r8 , 0, A::rcx});
1490 
1491         a.vmovd(A::rax, A::xmm0);
1492         a.vmovd(A::rax, A::xmm8);
1493         a.vmovd(A::r8 ,  A::xmm0);
1494 
1495         a.vmovd(A::xmm0, A::rax);
1496         a.vmovd(A::xmm8, A::rax);
1497         a.vmovd(A::xmm0, A::r8 );
1498 
1499         a.movb(A::Mem{A::rdx}, A::rax);
1500         a.movb(A::Mem{A::rdx}, A::r8 );
1501         a.movb(A::Mem{A::r8 }, A::rax);
1502 
1503         a.movb(A::rdx, A::Mem{A::rax});
1504         a.movb(A::rdx, A::Mem{A::r8 });
1505         a.movb(A::r8 , A::Mem{A::rax});
1506 
1507         a.movb(A::rdx, 12);
1508         a.movb(A::rax,  4);
1509         a.movb(A::r8 , -1);
1510 
1511         a.movb(A::Mem{A::rdx}, 12);
1512         a.movb(A::Mem{A::rax},  4);
1513         a.movb(A::Mem{A::r8 }, -1);
1514     },{
1515         0x48,0x0f,0xb6,0x06,     // movzbq (%rsi), %rax
1516         0x49,0x0f,0xb6,0x00,
1517         0x4c,0x0f,0xb6,0x06,
1518         0x4c,0x0f,0xb6,0x46, 12,
1519         0x4c,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,
1520 
1521         0x48,0x0f,0xb7,0x06,    // movzwq (%rsi), %rax
1522         0x49,0x0f,0xb7,0x00,
1523         0x4c,0x0f,0xb7,0x06,
1524         0x4c,0x0f,0xb7,0x46, 12,
1525         0x4c,0x0f,0xb7,0x86, 0x90,0x01,0x00,0x00,
1526 
1527         0xc5,0xf9,0x7e,0x00,
1528         0xc5,0x79,0x7e,0x00,
1529         0xc4,0xc1,0x79,0x7e,0x00,
1530 
1531         0xc5,0xf9,0x6e,0x00,
1532         0xc5,0x79,0x6e,0x00,
1533         0xc4,0xc1,0x79,0x6e,0x00,
1534 
1535         0xc5,0xf9,0x6e,0x04,0x88,
1536         0xc4,0x21,0x79,0x6e,0x3c,0x40,
1537         0xc4,0xc1,0x79,0x6e,0x04,0x08,
1538 
1539         0xc5,0xf9,0x7e,0xc0,
1540         0xc5,0x79,0x7e,0xc0,
1541         0xc4,0xc1,0x79,0x7e,0xc0,
1542 
1543         0xc5,0xf9,0x6e,0xc0,
1544         0xc5,0x79,0x6e,0xc0,
1545         0xc4,0xc1,0x79,0x6e,0xc0,
1546 
1547         0x48 ,0x88, 0x02,
1548         0x4c, 0x88, 0x02,
1549         0x49, 0x88, 0x00,
1550 
1551         0x48 ,0x8a, 0x10,
1552         0x49, 0x8a, 0x10,
1553         0x4c, 0x8a, 0x00,
1554 
1555         0x48, 0xc6, 0xc2, 0x0c,
1556         0x48, 0xc6, 0xc0, 0x04,
1557         0x49, 0xc6, 0xc0, 0xff,
1558 
1559         0x48, 0xc6, 0x02, 0x0c,
1560         0x48, 0xc6, 0x00, 0x04,
1561         0x49, 0xc6, 0x00, 0xff,
1562     });
1563 
1564     test_asm(r, [&](A& a) {
1565         a.vpinsrd(A::xmm1, A::xmm8, A::Mem{A::rsi}, 1);   // vpinsrd $1, (%rsi), %xmm8, %xmm1
1566         a.vpinsrd(A::xmm8, A::xmm1, A::Mem{A::r8 }, 3);   // vpinsrd $3, (%r8), %xmm1, %xmm8;
1567 
1568         a.vpinsrw(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4);   // vpinsrw $4, (%rsi), %xmm8, %xmm1
1569         a.vpinsrw(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinsrw $12, (%r8), %xmm1, %xmm8
1570 
1571         a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4);   // vpinsrb $4, (%rsi), %xmm8, %xmm1
1572         a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinsrb $12, (%r8), %xmm1, %xmm8
1573 
1574         a.vextracti128(A::xmm1, A::ymm8, 1);  // vextracti128 $1, %ymm8, %xmm1
1575         a.vextracti128(A::xmm8, A::ymm1, 0);  // vextracti128 $0, %ymm1, %xmm8
1576 
1577         a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3);  // vpextrd  $3, %xmm8, (%rsi)
1578         a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2);  // vpextrd  $2, %xmm1, (%r8)
1579 
1580         a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
1581         a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
1582 
1583         a.vpextrb(A::Mem{A::rsi}, A::xmm8, 7);
1584         a.vpextrb(A::Mem{A::r8 }, A::xmm1, 15);
1585     },{
1586         0xc4,0xe3,0x39, 0x22, 0x0e, 1,
1587         0xc4,0x43,0x71, 0x22, 0x00, 3,
1588 
1589         0xc5,0xb9,      0xc4, 0x0e,  4,
1590         0xc4,0x41,0x71, 0xc4, 0x00, 12,
1591 
1592         0xc4,0xe3,0x39, 0x20, 0x0e,  4,
1593         0xc4,0x43,0x71, 0x20, 0x00, 12,
1594 
1595         0xc4,0x63,0x7d,0x39,0xc1, 1,
1596         0xc4,0xc3,0x7d,0x39,0xc8, 0,
1597 
1598         0xc4,0x63,0x79,0x16,0x06, 3,
1599         0xc4,0xc3,0x79,0x16,0x08, 2,
1600 
1601         0xc4,0x63,0x79, 0x15, 0x06,  7,
1602         0xc4,0xc3,0x79, 0x15, 0x08, 15,
1603 
1604         0xc4,0x63,0x79, 0x14, 0x06,  7,
1605         0xc4,0xc3,0x79, 0x14, 0x08, 15,
1606     });
1607 
1608     test_asm(r, [&](A& a) {
1609         a.vpandn(A::ymm3, A::ymm12, A::ymm2);
1610     },{
1611         0xc5, 0x9d, 0xdf, 0xda,
1612     });
1613 
1614     test_asm(r, [&](A& a) {
1615         A::Label l;
1616         a.vmovdqa(A::ymm3, A::ymm2);                                // vmovdqa %ymm2         , %ymm3
1617 
1618         a.vmovdqa(A::ymm3, A::Mem{A::rsi});                         // vmovdqa  (%rsi)       , %ymm3
1619         a.vmovdqa(A::ymm3, A::Mem{A::rsp});                         // vmovdqa  (%rsp)       , %ymm3
1620         a.vmovdqa(A::ymm3, A::Mem{A::r11});                         // vmovdqa  (%r11)       , %ymm3
1621 
1622         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4});                     // vmovdqa 4(%rsi)       , %ymm3
1623         a.vmovdqa(A::ymm3, A::Mem{A::rsp,  4});                     // vmovdqa 4(%rsp)       , %ymm3
1624 
1625         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::rax, A::EIGHT});   // vmovdqa 4(%rsi,%rax,8), %ymm3
1626         a.vmovdqa(A::ymm3, A::Mem{A::r11,  4, A::rax, A::TWO  });   // vmovdqa 4(%r11,%rax,2), %ymm3
1627         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11, A::FOUR });   // vmovdqa 4(%rsi,%r11,4), %ymm3
1628         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11, A::ONE  });   // vmovdqa 4(%rsi,%r11,1), %ymm3
1629         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11});             // vmovdqa 4(%rsi,%r11)  , %ymm3
1630 
1631         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  64, A::r11});            // vmovdqa  64(%rsi,%r11), %ymm3
1632         a.vmovdqa(A::ymm3, A::Mem{A::rsi, 128, A::r11});            // vmovdqa 128(%rsi,%r11), %ymm3
1633         a.vmovdqa(A::ymm3, &l);                                     // vmovdqa  16(%rip)     , %ymm3
1634 
1635         a.vcvttps2dq(A::ymm3, A::ymm2);
1636         a.vcvtdq2ps (A::ymm3, A::ymm2);
1637         a.vcvtps2dq (A::ymm3, A::ymm2);
1638         a.vsqrtps   (A::ymm3, A::ymm2);
1639         a.label(&l);
1640     },{
1641         0xc5,0xfd,0x6f,0xda,
1642 
1643         0xc5,0xfd,0x6f,0x1e,
1644         0xc5,0xfd,0x6f,0x1c,0x24,
1645         0xc4,0xc1,0x7d,0x6f,0x1b,
1646 
1647         0xc5,0xfd,0x6f,0x5e,0x04,
1648         0xc5,0xfd,0x6f,0x5c,0x24,0x04,
1649 
1650         0xc5,0xfd,0x6f,0x5c,0xc6,0x04,
1651         0xc4,0xc1,0x7d,0x6f,0x5c,0x43,0x04,
1652         0xc4,0xa1,0x7d,0x6f,0x5c,0x9e,0x04,
1653         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1654         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1655 
1656         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x40,
1657         0xc4,0xa1,0x7d,0x6f,0x9c,0x1e,0x80,0x00,0x00,0x00,
1658 
1659         0xc5,0xfd,0x6f,0x1d,0x10,0x00,0x00,0x00,
1660 
1661         0xc5,0xfe,0x5b,0xda,
1662         0xc5,0xfc,0x5b,0xda,
1663         0xc5,0xfd,0x5b,0xda,
1664         0xc5,0xfc,0x51,0xda,
1665     });
1666 
1667     test_asm(r, [&](A& a) {
1668         a.vcvtps2ph(A::xmm3, A::ymm2, A::CURRENT);
1669         a.vcvtps2ph(A::Mem{A::rsi, 32, A::rax, A::EIGHT}, A::ymm5, A::CEIL);
1670 
1671         a.vcvtph2ps(A::ymm15, A::Mem{A::rdi, 12, A::r9, A::ONE});
1672         a.vcvtph2ps(A::ymm2, A::xmm3);
1673     },{
1674         0xc4,0xe3,0x7d,0x1d,0xd3,0x04,
1675         0xc4,0xe3,0x7d,0x1d,0x6c,0xc6,0x20,0x02,
1676 
1677         0xc4,0x22,0x7d,0x13,0x7c,0x0f,0x0c,
1678         0xc4,0xe2,0x7d,0x13,0xd3,
1679     });
1680 
1681     test_asm(r, [&](A& a) {
1682         a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
1683         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1684         a.vgatherdps(A::ymm10, A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1685         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm12, A::rax, A::ymm1 );
1686         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::r9 , A::ymm1 );
1687         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm12);
1688         a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
1689     },{
1690         0xc4,0xe2,0x6d,0x92,0x0c,0x87,
1691         0xc4,0xe2,0x75,0x92,0x04,0x10,
1692         0xc4,0x62,0x75,0x92,0x14,0x10,
1693         0xc4,0xa2,0x75,0x92,0x04,0x20,
1694         0xc4,0xc2,0x75,0x92,0x04,0x11,
1695         0xc4,0xe2,0x1d,0x92,0x04,0x10,
1696         0xc4,0xe2,0x1d,0x92,0x04,0xd0,
1697     });
1698 
1699     test_asm(r, [&](A& a) {
1700         a.mov(A::rax, A::Mem{A::rdi,   0});
1701         a.mov(A::rax, A::Mem{A::rdi,   1});
1702         a.mov(A::rax, A::Mem{A::rdi, 512});
1703         a.mov(A::r15, A::Mem{A::r13,  42});
1704         a.mov(A::rax, A::Mem{A::r13,  42});
1705         a.mov(A::r15, A::Mem{A::rax,  42});
1706         a.mov(A::rax, 1);
1707         a.mov(A::rax, A::rcx);
1708     },{
1709         0x48, 0x8b, 0x07,
1710         0x48, 0x8b, 0x47, 0x01,
1711         0x48, 0x8b, 0x87, 0x00,0x02,0x00,0x00,
1712         0x4d, 0x8b, 0x7d, 0x2a,
1713         0x49, 0x8b, 0x45, 0x2a,
1714         0x4c, 0x8b, 0x78, 0x2a,
1715         0x48, 0xc7, 0xc0, 0x01,0x00,0x00,0x00,
1716         0x48, 0x89, 0xc8,
1717     });
1718 
1719     // echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
1720 
1721     test_asm(r, [&](A& a) {
1722         a.and16b(A::v4, A::v3, A::v1);
1723         a.orr16b(A::v4, A::v3, A::v1);
1724         a.eor16b(A::v4, A::v3, A::v1);
1725         a.bic16b(A::v4, A::v3, A::v1);
1726         a.bsl16b(A::v4, A::v3, A::v1);
1727         a.not16b(A::v4, A::v3);
1728 
1729         a.add4s(A::v4, A::v3, A::v1);
1730         a.sub4s(A::v4, A::v3, A::v1);
1731         a.mul4s(A::v4, A::v3, A::v1);
1732 
1733         a.cmeq4s(A::v4, A::v3, A::v1);
1734         a.cmgt4s(A::v4, A::v3, A::v1);
1735 
1736         a.sub8h(A::v4, A::v3, A::v1);
1737         a.mul8h(A::v4, A::v3, A::v1);
1738 
1739         a.fadd4s(A::v4, A::v3, A::v1);
1740         a.fsub4s(A::v4, A::v3, A::v1);
1741         a.fmul4s(A::v4, A::v3, A::v1);
1742         a.fdiv4s(A::v4, A::v3, A::v1);
1743         a.fmin4s(A::v4, A::v3, A::v1);
1744         a.fmax4s(A::v4, A::v3, A::v1);
1745 
1746         a.fneg4s (A::v4, A::v3);
1747         a.fsqrt4s(A::v4, A::v3);
1748 
1749         a.fmla4s(A::v4, A::v3, A::v1);
1750         a.fmls4s(A::v4, A::v3, A::v1);
1751 
1752         a.fcmeq4s(A::v4, A::v3, A::v1);
1753         a.fcmgt4s(A::v4, A::v3, A::v1);
1754         a.fcmge4s(A::v4, A::v3, A::v1);
1755     },{
1756         0x64,0x1c,0x21,0x4e,
1757         0x64,0x1c,0xa1,0x4e,
1758         0x64,0x1c,0x21,0x6e,
1759         0x64,0x1c,0x61,0x4e,
1760         0x64,0x1c,0x61,0x6e,
1761         0x64,0x58,0x20,0x6e,
1762 
1763         0x64,0x84,0xa1,0x4e,
1764         0x64,0x84,0xa1,0x6e,
1765         0x64,0x9c,0xa1,0x4e,
1766 
1767         0x64,0x8c,0xa1,0x6e,
1768         0x64,0x34,0xa1,0x4e,
1769 
1770         0x64,0x84,0x61,0x6e,
1771         0x64,0x9c,0x61,0x4e,
1772 
1773         0x64,0xd4,0x21,0x4e,
1774         0x64,0xd4,0xa1,0x4e,
1775         0x64,0xdc,0x21,0x6e,
1776         0x64,0xfc,0x21,0x6e,
1777         0x64,0xf4,0xa1,0x4e,
1778         0x64,0xf4,0x21,0x4e,
1779 
1780         0x64,0xf8,0xa0,0x6e,
1781         0x64,0xf8,0xa1,0x6e,
1782 
1783         0x64,0xcc,0x21,0x4e,
1784         0x64,0xcc,0xa1,0x4e,
1785 
1786         0x64,0xe4,0x21,0x4e,
1787         0x64,0xe4,0xa1,0x6e,
1788         0x64,0xe4,0x21,0x6e,
1789     });
1790 
1791     test_asm(r, [&](A& a) {
1792         a.shl4s(A::v4, A::v3,  0);
1793         a.shl4s(A::v4, A::v3,  1);
1794         a.shl4s(A::v4, A::v3,  8);
1795         a.shl4s(A::v4, A::v3, 16);
1796         a.shl4s(A::v4, A::v3, 31);
1797 
1798         a.sshr4s(A::v4, A::v3,  1);
1799         a.sshr4s(A::v4, A::v3,  8);
1800         a.sshr4s(A::v4, A::v3, 31);
1801 
1802         a.ushr4s(A::v4, A::v3,  1);
1803         a.ushr4s(A::v4, A::v3,  8);
1804         a.ushr4s(A::v4, A::v3, 31);
1805 
1806         a.ushr8h(A::v4, A::v3,  1);
1807         a.ushr8h(A::v4, A::v3,  8);
1808         a.ushr8h(A::v4, A::v3, 15);
1809     },{
1810         0x64,0x54,0x20,0x4f,
1811         0x64,0x54,0x21,0x4f,
1812         0x64,0x54,0x28,0x4f,
1813         0x64,0x54,0x30,0x4f,
1814         0x64,0x54,0x3f,0x4f,
1815 
1816         0x64,0x04,0x3f,0x4f,
1817         0x64,0x04,0x38,0x4f,
1818         0x64,0x04,0x21,0x4f,
1819 
1820         0x64,0x04,0x3f,0x6f,
1821         0x64,0x04,0x38,0x6f,
1822         0x64,0x04,0x21,0x6f,
1823 
1824         0x64,0x04,0x1f,0x6f,
1825         0x64,0x04,0x18,0x6f,
1826         0x64,0x04,0x11,0x6f,
1827     });
1828 
1829     test_asm(r, [&](A& a) {
1830         a.sli4s(A::v4, A::v3,  0);
1831         a.sli4s(A::v4, A::v3,  1);
1832         a.sli4s(A::v4, A::v3,  8);
1833         a.sli4s(A::v4, A::v3, 16);
1834         a.sli4s(A::v4, A::v3, 31);
1835     },{
1836         0x64,0x54,0x20,0x6f,
1837         0x64,0x54,0x21,0x6f,
1838         0x64,0x54,0x28,0x6f,
1839         0x64,0x54,0x30,0x6f,
1840         0x64,0x54,0x3f,0x6f,
1841     });
1842 
1843     test_asm(r, [&](A& a) {
1844         a.scvtf4s (A::v4, A::v3);
1845         a.fcvtzs4s(A::v4, A::v3);
1846         a.fcvtns4s(A::v4, A::v3);
1847         a.frintp4s(A::v4, A::v3);
1848         a.frintm4s(A::v4, A::v3);
1849         a.fcvtn   (A::v4, A::v3);
1850         a.fcvtl   (A::v4, A::v3);
1851     },{
1852         0x64,0xd8,0x21,0x4e,
1853         0x64,0xb8,0xa1,0x4e,
1854         0x64,0xa8,0x21,0x4e,
1855         0x64,0x88,0xa1,0x4e,
1856         0x64,0x98,0x21,0x4e,
1857         0x64,0x68,0x21,0x0e,
1858         0x64,0x78,0x21,0x0e,
1859     });
1860 
1861     test_asm(r, [&](A& a) {
1862         a.sub (A::sp, A::sp, 32);  // sub   sp, sp, #32
1863         a.strq(A::v0, A::sp, 1);   // str   q0, [sp, #16]
1864         a.strq(A::v1, A::sp);      // str   q1, [sp]
1865         a.strd(A::v0, A::sp, 6);   // str   d0, [sp, #48]
1866         a.strs(A::v0, A::sp, 6);   // str   s0, [sp, #24]
1867         a.strh(A::v0, A::sp, 10);  // str   h0, [sp, #20]
1868         a.strb(A::v0, A::sp, 47);  // str   b0, [sp, #47]
1869         a.ldrb(A::v9, A::sp, 42);  // ldr   b9, [sp, #42]
1870         a.ldrh(A::v9, A::sp, 47);  // ldr   h9, [sp, #94]
1871         a.ldrs(A::v7, A::sp, 10);  // ldr   s7, [sp, #40]
1872         a.ldrd(A::v7, A::sp,  1);  // ldr   d7, [sp, #8]
1873         a.ldrq(A::v5, A::sp, 128); // ldr   q5, [sp, #2048]
1874         a.add (A::sp, A::sp, 32);  // add   sp, sp, #32
1875     },{
1876          0xff,0x83,0x00,0xd1,
1877          0xe0,0x07,0x80,0x3d,
1878          0xe1,0x03,0x80,0x3d,
1879          0xe0,0x1b,0x00,0xfd,
1880          0xe0,0x1b,0x00,0xbd,
1881          0xe0,0x2b,0x00,0x7d,
1882          0xe0,0xbf,0x00,0x3d,
1883          0xe9,0xab,0x40,0x3d,
1884          0xe9,0xbf,0x40,0x7d,
1885          0xe7,0x2b,0x40,0xbd,
1886          0xe7,0x07,0x40,0xfd,
1887          0xe5,0x03,0xc2,0x3d,
1888          0xff,0x83,0x00,0x91,
1889     });
1890 
1891     test_asm(r, [&](A& a) {
1892         a.brk(0);
1893         a.brk(65535);
1894 
1895         a.ret(A::x30);   // Conventional ret using link register.
1896         a.ret(A::x13);   // Can really return using any register if we like.
1897 
1898         a.add(A::x2, A::x2,  4);
1899         a.add(A::x3, A::x2, 32);
1900 
1901         a.sub(A::x2, A::x2, 4);
1902         a.sub(A::x3, A::x2, 32);
1903 
1904         a.subs(A::x2, A::x2,  4);
1905         a.subs(A::x3, A::x2, 32);
1906 
1907         a.subs(A::xzr, A::x2, 4);  // These are actually the same instruction!
1908         a.cmp(A::x2, 4);
1909 
1910         A::Label l;
1911         a.label(&l);
1912         a.bne(&l);
1913         a.bne(&l);
1914         a.blt(&l);
1915         a.b(&l);
1916         a.cbnz(A::x2, &l);
1917         a.cbz(A::x2, &l);
1918 
1919         a.add(A::x3, A::x2, A::x1);             // add x3,x2,x1
1920         a.add(A::x3, A::x2, A::x1, A::ASR, 3);  // add x3,x2,x1, asr #3
1921     },{
1922         0x00,0x00,0x20,0xd4,
1923         0xe0,0xff,0x3f,0xd4,
1924 
1925         0xc0,0x03,0x5f,0xd6,
1926         0xa0,0x01,0x5f,0xd6,
1927 
1928         0x42,0x10,0x00,0x91,
1929         0x43,0x80,0x00,0x91,
1930 
1931         0x42,0x10,0x00,0xd1,
1932         0x43,0x80,0x00,0xd1,
1933 
1934         0x42,0x10,0x00,0xf1,
1935         0x43,0x80,0x00,0xf1,
1936 
1937         0x5f,0x10,0x00,0xf1,
1938         0x5f,0x10,0x00,0xf1,
1939 
1940         0x01,0x00,0x00,0x54,   // b.ne #0
1941         0xe1,0xff,0xff,0x54,   // b.ne #-4
1942         0xcb,0xff,0xff,0x54,   // b.lt #-8
1943         0xae,0xff,0xff,0x54,   // b.al #-12
1944         0x82,0xff,0xff,0xb5,   // cbnz x2, #-16
1945         0x62,0xff,0xff,0xb4,   // cbz x2, #-20
1946 
1947         0x43,0x00,0x01,0x8b,
1948         0x43,0x0c,0x81,0x8b,
1949     });
1950 
1951     // Can we cbz() to a not-yet-defined label?
1952     test_asm(r, [&](A& a) {
1953         A::Label l;
1954         a.cbz(A::x2, &l);
1955         a.add(A::x3, A::x2, 32);
1956         a.label(&l);
1957         a.ret(A::x30);
1958     },{
1959         0x42,0x00,0x00,0xb4,  // cbz x2, #8
1960         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1961         0xc0,0x03,0x5f,0xd6,  // ret
1962     });
1963 
1964     // If we start a label as a backward label,
1965     // can we redefine it to be a future label?
1966     // (Not sure this is useful... just want to test it works.)
1967     test_asm(r, [&](A& a) {
1968         A::Label l1;
1969         a.label(&l1);
1970         a.add(A::x3, A::x2, 32);
1971         a.cbz(A::x2, &l1);          // This will jump backward... nothing sneaky.
1972 
1973         A::Label l2;                // Start off the same...
1974         a.label(&l2);
1975         a.add(A::x3, A::x2, 32);
1976         a.cbz(A::x2, &l2);          // Looks like this will go backward...
1977         a.add(A::x2, A::x2, 4);
1978         a.add(A::x3, A::x2, 32);
1979         a.label(&l2);               // But no... actually forward!  What a switcheroo!
1980     },{
1981         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1982         0xe2,0xff,0xff,0xb4,  // cbz x2, #-4
1983 
1984         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1985         0x62,0x00,0x00,0xb4,  // cbz x2, #12
1986         0x42,0x10,0x00,0x91,  // add x2, x2, #4
1987         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1988     });
1989 
1990     // Loading from a label on ARM.
1991     test_asm(r, [&](A& a) {
1992         A::Label fore,aft;
1993         a.label(&fore);
1994         a.word(0x01234567);
1995         a.ldrq(A::v1, &fore);
1996         a.ldrq(A::v2, &aft);
1997         a.label(&aft);
1998         a.word(0x76543210);
1999     },{
2000         0x67,0x45,0x23,0x01,
2001         0xe1,0xff,0xff,0x9c,  // ldr q1, #-4
2002         0x22,0x00,0x00,0x9c,  // ldr q2, #4
2003         0x10,0x32,0x54,0x76,
2004     });
2005 
2006     test_asm(r, [&](A& a) {
2007         a.ldrq(A::v0, A::x8);
2008         a.strq(A::v0, A::x8);
2009     },{
2010         0x00,0x01,0xc0,0x3d,
2011         0x00,0x01,0x80,0x3d,
2012     });
2013 
2014     test_asm(r, [&](A& a) {
2015         a.dup4s  (A::v0, A::x8);
2016         a.ld1r4s (A::v0, A::x8);  // echo 'ld1r.4s {v0}, [x8]' | llvm-mc --show-encoding
2017         a.ld1r8h (A::v0, A::x8);
2018         a.ld1r16b(A::v0, A::x8);
2019     },{
2020         0x00,0x0d,0x04,0x4e,
2021         0x00,0xc9,0x40,0x4d,
2022         0x00,0xc5,0x40,0x4d,
2023         0x00,0xc1,0x40,0x4d,
2024     });
2025 
2026     test_asm(r, [&](A& a) {
2027         a.ld24s(A::v0, A::x8);  // echo 'ld2.4s {v0,v1}, [x8]' | llvm-mc --show-encoding
2028         a.ld44s(A::v0, A::x8);
2029         a.st24s(A::v0, A::x8);
2030         a.st44s(A::v0, A::x8);  // echo 'st4.4s {v0,v1,v2,v3}, [x8]' | llvm-mc --show-encoding
2031 
2032         a.ld24s(A::v0, A::x8, 0);  //echo 'ld2 {v0.s,v1.s}[0], [x8]' | llvm-mc --show-encoding
2033         a.ld24s(A::v0, A::x8, 1);
2034         a.ld24s(A::v0, A::x8, 2);
2035         a.ld24s(A::v0, A::x8, 3);
2036 
2037         a.ld44s(A::v0, A::x8, 0);  // ld4 {v0.s,v1.s,v2.s,v3.s}[0], [x8]
2038         a.ld44s(A::v0, A::x8, 1);
2039         a.ld44s(A::v0, A::x8, 2);
2040         a.ld44s(A::v0, A::x8, 3);
2041     },{
2042         0x00,0x89,0x40,0x4c,
2043         0x00,0x09,0x40,0x4c,
2044         0x00,0x89,0x00,0x4c,
2045         0x00,0x09,0x00,0x4c,
2046 
2047         0x00,0x81,0x60,0x0d,
2048         0x00,0x91,0x60,0x0d,
2049         0x00,0x81,0x60,0x4d,
2050         0x00,0x91,0x60,0x4d,
2051 
2052         0x00,0xa1,0x60,0x0d,
2053         0x00,0xb1,0x60,0x0d,
2054         0x00,0xa1,0x60,0x4d,
2055         0x00,0xb1,0x60,0x4d,
2056     });
2057 
2058     test_asm(r, [&](A& a) {
2059         a.xtns2h(A::v0, A::v0);
2060         a.xtnh2b(A::v0, A::v0);
2061         a.strs  (A::v0, A::x0);
2062 
2063         a.ldrs   (A::v0, A::x0);
2064         a.uxtlb2h(A::v0, A::v0);
2065         a.uxtlh2s(A::v0, A::v0);
2066 
2067         a.uminv4s(A::v3, A::v4);
2068         a.movs   (A::x3, A::v4,0);  // mov.s w3,v4[0]
2069         a.movs   (A::x3, A::v4,1);  // mov.s w3,v4[1]
2070         a.inss   (A::v4, A::x3,3);  // ins.s v4[3],w3
2071     },{
2072         0x00,0x28,0x61,0x0e,
2073         0x00,0x28,0x21,0x0e,
2074         0x00,0x00,0x00,0xbd,
2075 
2076         0x00,0x00,0x40,0xbd,
2077         0x00,0xa4,0x08,0x2f,
2078         0x00,0xa4,0x10,0x2f,
2079 
2080         0x83,0xa8,0xb1,0x6e,
2081         0x83,0x3c,0x04,0x0e,
2082         0x83,0x3c,0x0c,0x0e,
2083         0x64,0x1c,0x1c,0x4e,
2084     });
2085 
2086     test_asm(r, [&](A& a) {
2087         a.ldrb(A::v0, A::x8);
2088         a.strb(A::v0, A::x8);
2089     },{
2090         0x00,0x01,0x40,0x3d,
2091         0x00,0x01,0x00,0x3d,
2092     });
2093 
2094     test_asm(r, [&](A& a) {
2095         a.ldrd(A::x0, A::x1, 3);   // ldr  x0, [x1, #24]
2096         a.ldrs(A::x0, A::x1, 3);   // ldr  w0, [x1, #12]
2097         a.ldrh(A::x0, A::x1, 3);   // ldrh w0, [x1, #6]
2098         a.ldrb(A::x0, A::x1, 3);   // ldrb w0, [x1, #3]
2099 
2100         a.strs(A::x0, A::x1, 3);   // str  w0, [x1, #12]
2101     },{
2102         0x20,0x0c,0x40,0xf9,
2103         0x20,0x0c,0x40,0xb9,
2104         0x20,0x0c,0x40,0x79,
2105         0x20,0x0c,0x40,0x39,
2106 
2107         0x20,0x0c,0x00,0xb9,
2108     });
2109 
2110     test_asm(r, [&](A& a) {
2111         a.tbl   (A::v0, A::v1, A::v2);
2112         a.uzp14s(A::v0, A::v1, A::v2);
2113         a.uzp24s(A::v0, A::v1, A::v2);
2114         a.zip14s(A::v0, A::v1, A::v2);
2115         a.zip24s(A::v0, A::v1, A::v2);
2116     },{
2117         0x20,0x00,0x02,0x4e,
2118         0x20,0x18,0x82,0x4e,
2119         0x20,0x58,0x82,0x4e,
2120         0x20,0x38,0x82,0x4e,
2121         0x20,0x78,0x82,0x4e,
2122     });
2123 }
2124 
DEF_TEST(SkVM_approx_math,r)2125 DEF_TEST(SkVM_approx_math, r) {
2126     auto eval = [](int N, float values[], auto fn) {
2127         skvm::Builder b;
2128         skvm::Ptr inout  = b.varying<float>();
2129 
2130         b.storeF(inout, fn(&b, b.loadF(inout)));
2131 
2132         b.done().eval(N, values);
2133     };
2134 
2135     auto compare = [r](int N, const float values[], const float expected[]) {
2136         for (int i = 0; i < N; ++i) {
2137             REPORTER_ASSERT(r, (values[i] == expected[i]) ||
2138                                SkScalarNearlyEqual(values[i], expected[i], 0.001f),
2139                                "evaluated to %g, but expected %g", values[i], expected[i]);
2140         }
2141     };
2142 
2143     // log2
2144     {
2145         float values[] = {0.25f, 0.5f, 1, 2, 4, 8};
2146         constexpr int N = SK_ARRAY_COUNT(values);
2147         eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
2148             return b->approx_log2(v);
2149         });
2150         const float expected[] = {-2, -1, 0, 1, 2, 3};
2151         compare(N, values, expected);
2152     }
2153 
2154     // pow2
2155     {
2156         float values[] = {-80, -5, -2, -1, 0, 1, 2, 3, 5, 160};
2157         constexpr int N = SK_ARRAY_COUNT(values);
2158         eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
2159             return b->approx_pow2(v);
2160         });
2161         const float expected[] = {0, 0.03125f, 0.25f, 0.5f, 1, 2, 4, 8, 32, INFINITY};
2162         compare(N, values, expected);
2163     }
2164     // powf -- 1^x
2165     {
2166         float exps[] = {-2, -1, 0, 1, 2};
2167         constexpr int N = SK_ARRAY_COUNT(exps);
2168         eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
2169             return b->approx_powf(b->splat(1.0f), exp);
2170         });
2171         const float expected[] = {1, 1, 1, 1, 1};
2172         compare(N, exps, expected);
2173     }
2174     // powf -- 2^x
2175     {
2176         float exps[] = {-80, -5, -2, -1, 0, 1, 2, 3, 5, 160};
2177         constexpr int N = SK_ARRAY_COUNT(exps);
2178         eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
2179             return b->approx_powf(2.0, exp);
2180         });
2181         const float expected[] = {0, 0.03125f, 0.25f, 0.5f, 1, 2, 4, 8, 32, INFINITY};
2182         compare(N, exps, expected);
2183     }
2184     // powf -- 3^x
2185     {
2186         float exps[] = {-2, -1, 0, 1, 2};
2187         constexpr int N = SK_ARRAY_COUNT(exps);
2188         eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
2189             return b->approx_powf(b->splat(3.0f), exp);
2190         });
2191         const float expected[] = {1/9.0f, 1/3.0f, 1, 3, 9};
2192         compare(N, exps, expected);
2193     }
2194     // powf -- x^0.5
2195     {
2196         float bases[] = {0, 1, 4, 9, 16};
2197         constexpr int N = SK_ARRAY_COUNT(bases);
2198         eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
2199             return b->approx_powf(base, b->splat(0.5f));
2200         });
2201         const float expected[] = {0, 1, 2, 3, 4};
2202         compare(N, bases, expected);
2203     }
2204     // powf -- x^1
2205     {
2206         float bases[] = {0, 1, 2, 3, 4};
2207         constexpr int N = SK_ARRAY_COUNT(bases);
2208         eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
2209             return b->approx_powf(base, b->splat(1.0f));
2210         });
2211         const float expected[] = {0, 1, 2, 3, 4};
2212         compare(N, bases, expected);
2213     }
2214     // powf -- x^2
2215     {
2216         float bases[] = {0, 1, 2, 3, 4};
2217         constexpr int N = SK_ARRAY_COUNT(bases);
2218         eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
2219             return b->approx_powf(base, b->splat(2.0f));
2220         });
2221         const float expected[] = {0, 1, 4, 9, 16};
2222         compare(N, bases, expected);
2223     }
2224 
2225     auto test = [r](float arg, float expected, float tolerance, auto prog) {
2226         skvm::Builder b;
2227         skvm::Ptr inout  = b.varying<float>();
2228         b.storeF(inout, prog(b.loadF(inout)));
2229         float actual = arg;
2230         b.done().eval(1, &actual);
2231 
2232         float err = std::abs(actual - expected);
2233 
2234         if (err > tolerance) {
2235     //        SkDebugf("arg %g, expected %g, actual %g\n", arg, expected, actual);
2236             REPORTER_ASSERT(r, true);
2237         }
2238         return err;
2239     };
2240 
2241     auto test2 = [r](float arg0, float arg1, float expected, float tolerance, auto prog) {
2242         skvm::Builder b;
2243         skvm::Ptr in0  = b.varying<float>();
2244         skvm::Ptr in1  = b.varying<float>();
2245         skvm::Ptr out  = b.varying<float>();
2246         b.storeF(out, prog(b.loadF(in0), b.loadF(in1)));
2247         float actual;
2248         b.done().eval(1, &arg0, &arg1, &actual);
2249 
2250         float err = std::abs(actual - expected);
2251 
2252         if (err > tolerance) {
2253     //        SkDebugf("[%g, %g]: expected %g, actual %g\n", arg0, arg1, expected, actual);
2254             REPORTER_ASSERT(r, true);
2255         }
2256         return err;
2257     };
2258 
2259     // sine, cosine, tangent
2260     {
2261         constexpr float P = SK_ScalarPI;
2262         constexpr float tol = 0.00175f;
2263         for (float rad = -5*P; rad <= 5*P; rad += 0.1f) {
2264             test(rad, sk_float_sin(rad), tol, [](skvm::F32 x) {
2265                 return approx_sin(x);
2266             });
2267             test(rad, sk_float_cos(rad), tol, [](skvm::F32 x) {
2268                 return approx_cos(x);
2269             });
2270         }
2271 
2272         // Our tangent diverge more as we get near infinities (x near +- Pi/2),
2273         // so bring in the domain a little.
2274         constexpr float eps = 0.16f;
2275         float err = 0;
2276         for (float rad = -P/2 + eps; rad <= P/2 - eps; rad += 0.01f) {
2277             err += test(rad, sk_float_tan(rad), tol, [](skvm::F32 x) {
2278                 return approx_tan(x);
2279             });
2280             // try again with some multiples of P, to check our periodicity
2281             test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
2282                 return approx_tan(x + 3*P);
2283             });
2284             test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
2285                 return approx_tan(x - 3*P);
2286             });
2287         }
2288         if ((false)) { SkDebugf("tan error %g\n", err); }
2289     }
2290 
2291     // asin, acos, atan
2292     {
2293         constexpr float tol = 0.00175f;
2294         float err = 0;
2295         for (float x = -1; x <= 1; x += 1.0f/64) {
2296             err += test(x, asin(x), tol, [](skvm::F32 x) {
2297                 return approx_asin(x);
2298             });
2299             test(x, acos(x), tol, [](skvm::F32 x) {
2300                 return approx_acos(x);
2301             });
2302         }
2303         if ((false)) { SkDebugf("asin error %g\n", err); }
2304 
2305         err = 0;
2306         for (float x = -10; x <= 10; x += 1.0f/16) {
2307             err += test(x, atan(x), tol, [](skvm::F32 x) {
2308                 return approx_atan(x);
2309             });
2310         }
2311         if ((false)) { SkDebugf("atan error %g\n", err); }
2312 
2313         for (float y = -3; y <= 3; y += 1) {
2314             for (float x = -3; x <= 3; x += 1) {
2315                 err += test2(y, x, atan2(y,x), tol, [](skvm::F32 y, skvm::F32 x) {
2316                     return approx_atan2(y,x);
2317                 });
2318             }
2319         }
2320         if ((false)) { SkDebugf("atan2 error %g\n", err); }
2321     }
2322 }
2323 
DEF_TEST(SkVM_min_max,r)2324 DEF_TEST(SkVM_min_max, r) {
2325     // min() and max() have subtle behavior when one argument is NaN and
2326     // the other isn't.  It's not sound to blindly swap their arguments.
2327     //
2328     // All backends must behave like std::min() and std::max(), which are
2329     //
2330     //    min(x,y) = y<x ? y : x
2331     //    max(x,y) = x<y ? y : x
2332 
2333     // ±NaN, ±0, ±1, ±inf
2334     const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
2335                              0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
2336 
2337     float f[8];
2338     memcpy(f, bits, sizeof(bits));
2339 
2340     auto identical = [&](float x, float y) {
2341         uint32_t X,Y;
2342         memcpy(&X, &x, 4);
2343         memcpy(&Y, &y, 4);
2344         return X == Y;
2345     };
2346 
2347     // Test min/max with non-constant x, non-constant y.
2348     // (Whether x and y are varying or uniform shouldn't make any difference.)
2349     {
2350         skvm::Builder b;
2351         {
2352             skvm::Ptr src = b.varying<float>(),
2353                        mn = b.varying<float>(),
2354                        mx = b.varying<float>();
2355 
2356             skvm::F32 x = b.loadF(src),
2357                       y = b.uniformF(b.uniform(), 0);
2358 
2359             b.storeF(mn, b.min(x,y));
2360             b.storeF(mx, b.max(x,y));
2361         }
2362 
2363         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2364             float mn[8], mx[8];
2365             for (int i = 0; i < 8; i++) {
2366                 // min() and max() everything with f[i].
2367                 program.eval(8, f,mn,mx, &f[i]);
2368 
2369                 for (int j = 0; j < 8; j++) {
2370                     REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
2371                     REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
2372                 }
2373             }
2374         });
2375     }
2376 
2377     // Test each with constant on the right.
2378     for (int i = 0; i < 8; i++) {
2379         skvm::Builder b;
2380         {
2381             skvm::Ptr src = b.varying<float>(),
2382                        mn = b.varying<float>(),
2383                        mx = b.varying<float>();
2384 
2385             skvm::F32 x = b.loadF(src),
2386                       y = b.splat(f[i]);
2387 
2388             b.storeF(mn, b.min(x,y));
2389             b.storeF(mx, b.max(x,y));
2390         }
2391 
2392         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2393             float mn[8], mx[8];
2394             program.eval(8, f,mn,mx);
2395             for (int j = 0; j < 8; j++) {
2396                 REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
2397                 REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
2398             }
2399         });
2400     }
2401 
2402     // Test each with constant on the left.
2403     for (int i = 0; i < 8; i++) {
2404         skvm::Builder b;
2405         {
2406             skvm::Ptr src = b.varying<float>(),
2407                        mn = b.varying<float>(),
2408                        mx = b.varying<float>();
2409 
2410             skvm::F32 x = b.splat(f[i]),
2411                       y = b.loadF(src);
2412 
2413             b.storeF(mn, b.min(x,y));
2414             b.storeF(mx, b.max(x,y));
2415         }
2416 
2417         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2418             float mn[8], mx[8];
2419             program.eval(8, f,mn,mx);
2420             for (int j = 0; j < 8; j++) {
2421                 REPORTER_ASSERT(r, identical(mn[j], std::min(f[i], f[j])));
2422                 REPORTER_ASSERT(r, identical(mx[j], std::max(f[i], f[j])));
2423             }
2424         });
2425     }
2426 }
2427 
DEF_TEST(SkVM_halfs, r) {
    constexpr int kCount = 8;

    // Parallel tables: hs[i] is the half-float bit pattern paired with fs[i].
    const uint16_t hs[kCount] = {0x0000,0x3800,0x3c00,0x4000,
                                 0xc400,0xb800,0xbc00,0xc000};
    const float fs[kCount] = {+0.0f,+0.5f,+1.0f,+2.0f,
                              -4.0f,-0.5f,-1.0f,-2.0f};

    // Half -> float: load 16 bits, from_fp16(), store as float.
    {
        skvm::Builder b;
        skvm::Ptr halfIn   = b.varying<uint16_t>(),
                  floatOut = b.varying<float>();
        b.storeF(floatOut, b.from_fp16(b.load16(halfIn)));

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            float got[kCount];
            program.eval(kCount, hs, got);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, got[i] == fs[i]);
            }
        });
    }

    // Float -> half: load float, to_fp16(), store 16 bits.
    {
        skvm::Builder b;
        skvm::Ptr floatIn = b.varying<float>(),
                  halfOut = b.varying<uint16_t>();
        b.store16(halfOut, b.to_fp16(b.loadF(floatIn)));

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint16_t got[kCount];
            program.eval(kCount, fs, got);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, got[i] == hs[i]);
            }
        });
    }
}
2462 
DEF_TEST(SkVM_64bit, r) {
    constexpr int kCount = 65;

    // Reference data: wide[i] packs lo[i] into its low 32 bits and hi[i] into
    // its high 32 bits.
    uint32_t lo[kCount],
             hi[kCount];
    uint64_t wide[kCount];
    for (int i = 0; i < kCount; i++) {
        lo[i] = 2*i+0;
        hi[i] = 2*i+1;

        const uint64_t lowBits  = static_cast<uint64_t>(lo[i]) <<  0,
                       highBits = static_cast<uint64_t>(hi[i]) << 32;
        wide[i] = lowBits | highBits;
    }

    // 64-bit loads: split each wide value into its two 32-bit lanes.
    {
        skvm::Builder b;
        {
            skvm::Ptr src64 = b.varying<uint64_t>(),
                      dstLo = b.varying<int>(),
                      dstHi = b.varying<int>();
            b.store32(dstLo, b.load64(src64, 0));
            b.store32(dstHi, b.load64(src64, 1));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint32_t gotLo[kCount], gotHi[kCount];
            program.eval(kCount, wide,gotLo,gotHi);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, gotLo[i] == lo[i]);
                REPORTER_ASSERT(r, gotHi[i] == hi[i]);
            }
        });
    }

    // 64-bit stores: combine the two 32-bit lanes back into a wide value.
    {
        skvm::Builder b;
        {
            skvm::Ptr dst64 = b.varying<uint64_t>(),
                      srcLo = b.varying<int>(),
                      srcHi = b.varying<int>();
            b.store64(dst64, b.load32(srcLo), b.load32(srcHi));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint64_t gotWide[kCount];
            program.eval(kCount, gotWide,lo,hi);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, gotWide[i] == wide[i]);
            }
        });
    }
}
2510 
DEF_TEST(SkVM_128bit, r) {
    constexpr int kPixels   = 63;
    constexpr int kChannels = 4*kPixels;

    float   floats[kChannels];
    uint8_t packed[kChannels];

    // Channel i holds i/255, so it should round-trip to the byte value i.
    for (int i = 0; i < kChannels; i++) {
        floats[i] = i * (1/255.0f);
    }

    skvm::PixelFormat rgba_ffff = skvm::SkColorType_to_PixelFormat(kRGBA_F32_SkColorType),
                      rgba_8888 = skvm::SkColorType_to_PixelFormat(kRGBA_8888_SkColorType);

    // RGBA F32 -> RGBA 8888, which exercises 128-bit loads.
    {
        skvm::Builder b;
        {
            skvm::Ptr out8888 = b.varying(4),
                      inF32   = b.varying(16);

            skvm::Color color = b.load(rgba_ffff, inF32);
            b.store(rgba_8888, out8888, color);
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            // Clear the output first so every byte must be written by this run.
            memset(packed, 0, sizeof(packed));
            program.eval(kPixels, packed, floats);
            for (int i = 0; i < kChannels; i++) {
                REPORTER_ASSERT(r, packed[i] == i);
            }
        });
    }


    // RGBA 8888 -> RGBA F32, which exercises 128-bit stores.
    // Reads the bytes left in packed[] by the conversion above.
    {
        skvm::Builder b;
        {
            skvm::Ptr outF32 = b.varying(16),
                      in8888 = b.varying(4);

            skvm::Color color = b.load(rgba_8888, in8888);
            b.store(rgba_ffff, outF32, color);
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            memset(floats, 0, sizeof(floats));
            program.eval(kPixels, floats, packed);
            for (int i = 0; i < kChannels; i++) {
                REPORTER_ASSERT(r, floats[i] == i * (1/255.0f));
            }
        });
    }

}
2560 
DEF_TEST(SkVM_is_NaN_is_finite, r) {
    // One program that writes is_NaN(x) and is_finite(x) for each input float
    // to two separate 32-bit outputs.
    skvm::Builder b;
    {
        skvm::Ptr in        = b.varying<float>(),
                  outNaN    = b.varying<int>(),
                  outFinite = b.varying<int>();
        b.store32(outNaN   , is_NaN   (b.loadF(in)));
        b.store32(outFinite, is_finite(b.loadF(in)));
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 8;
        constexpr uint32_t kTrue  = 0xffffffff,
                           kFalse = 0;

        // ±NaN, ±0, ±1, ±inf
        const uint32_t bits[kCount] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                                       0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        uint32_t gotNaN[kCount], gotFinite[kCount];
        program.eval(kCount, bits, gotNaN,gotFinite);

        for (int i = 0; i < kCount; i++) {
            // Indices 0-1 are the NaNs; 2-5 are the finite values (±0, ±1).
            const bool expectNaN    = (i == 0 || i == 1);
            const bool expectFinite = (i == 2 || i == 3 ||
                                       i == 4 || i == 5);

            REPORTER_ASSERT(r, gotNaN[i]    == (expectNaN    ? kTrue : kFalse));
            REPORTER_ASSERT(r, gotFinite[i] == (expectFinite ? kTrue : kFalse));
        }
    });
}
2584 
// Verifies that a program taking six pointer arguments (one destination, five sources)
// runs correctly: dst[i] must equal the sum of the five inputs.
DEF_TEST(SkVM_args, r) {
    skvm::Builder b;
    {
        // Argument order: dst first, then the five inputs.
        skvm::Ptr dst = b.varying<float>();
        skvm::Ptr A   = b.varying<float>();
        skvm::Ptr B   = b.varying<float>();
        skvm::Ptr C   = b.varying<float>();
        skvm::Ptr D   = b.varying<float>();
        skvm::Ptr E   = b.varying<float>();

        storeF(dst, b.loadF(A) + b.loadF(B) + b.loadF(C) + b.loadF(D) + b.loadF(E));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        constexpr int kCount = 17;
        float dst[kCount],
              A[kCount], B[kCount], C[kCount], D[kCount], E[kCount];

        // Every input holds its own index, so the expected sum is 5*i.
        for (int i = 0; i < kCount; i++) {
            const float v = (float)i;
            A[i] = v;
            B[i] = v;
            C[i] = v;
            D[i] = v;
            E[i] = v;
        }

        program.eval(kCount, dst, A, B, C, D, E);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, dst[i] == 5.0f*i);
        }
    });
}
2613 
// Regression test distilled from a real failing draw that was caused by a bad
// arm64 implementation of pack().  Packs a uniform-derived red nibble and a constant
// alpha nibble into a 16-bit pixel and checks the result.
DEF_TEST(SkVM_badpack, reporter) {
    skvm::Builder p;
    {
        skvm::UPtr uniforms = p.uniform();
        skvm::Ptr  dst      = p.varying<uint16_t>();

        // The float at byte offset 8 of the uniforms, scaled to a 4-bit channel.
        skvm::I32 red   = round(p.uniformF(uniforms, 8) * 15);
        skvm::I32 alpha = p.splat(0xf);

        skvm::I32 pixel = p.splat(0);
        pixel = pack(pixel, red,   12);
        pixel = pack(pixel, alpha,  0);
        store16(dst, pixel);
    }

    test_jit_and_interpreter(p, [&](const skvm::Program& program) {
        // Byte offset 8 lands on the third float, 1.0f.
        const float uniforms[] = { 0.0f, 0.0f,
                                   1.0f, 0.0f, 0.0f, 1.0f };

        constexpr int kCount = 17;
        uint16_t dst[kCount] = {0};
        program.eval(kCount, uniforms, dst);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(reporter, dst[i] == 0xf00f, "got %04x, want %04x\n", dst[i], 0xf00f);
        }
    });
}
2642 
// Checks that the optimized instruction count of x*x+x depends on whether the
// builder is told FMA is available: 3 instructions with it, 4 without.
DEF_TEST(SkVM_features, r) {
    // Emits: load x, compute x*x+x, store.
    auto emit_x2_plus_x = [](skvm::Builder* builder) {
        skvm::F32 x = builder->loadF(builder->varying<float>());
        builder->storeF(builder->varying<float>(), x*x+x);
    };

    // Builds the program with FMA explicitly on or off and returns its optimized size.
    auto optimized_size = [&](bool fma) {
        skvm::Features features;
        features.fma = fma;
        skvm::Builder b(features);
        emit_x2_plus_x(&b);
        return b.optimize().size();
    };

    // load-fma-store with FMA available.
    REPORTER_ASSERT(r, optimized_size(true) == 3);

    // load-mul-add-store without FMA.
    REPORTER_ASSERT(r, optimized_size(false) == 4);

    {   // Auto-detected features, so either outcome is acceptable.
        skvm::Builder b;
        emit_x2_plus_x(&b);
        const auto count = b.optimize().size();
        REPORTER_ASSERT(r, count == 3 || count == 4);
    }
}
2672 
// A gather is only as varying as its index.  With a varying index it must stay inside
// the loop; with a uniform index it can be hoisted out along with the index itself.
DEF_TEST(SkVM_gather_can_hoist, r) {
    {   // Typical case: the index is loaded from varying memory.
        skvm::Builder b;
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  buf      = b.varying<int>();

        skvm::I32 index = b.load32(buf);
        b.store32(buf, b.gather32(uniforms,0, index));

        skvm::Program program = b.done();

        // Nothing can be hoisted, so the loop starts at instruction 0:
        //
        // loop:
        //     v0 = load32 buf
        //     v1 = gather32 uniforms+0 v0
        //     store32 buf v1
        REPORTER_ASSERT(r, program.instructions().size() == 3);
        REPORTER_ASSERT(r, program.loop() == 0);
    }

    {   // Same program, but the index now comes from a uniform.
        skvm::Builder b;
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  buf      = b.varying<int>();

        skvm::I32 index = b.uniform32(uniforms,8);
        b.store32(buf, b.gather32(uniforms,0, index));

        skvm::Program program = b.done();

        // Both the uniform load and the gather precede the loop, which starts at 2:
        //
        // v0 = uniform32 uniforms+8
        // v1 = gather32 uniforms+0 v0
        // loop:
        //     store32 buf v1
        REPORTER_ASSERT(r, program.instructions().size() == 3);
        REPORTER_ASSERT(r, program.loop() == 2);
    }
}
2715 
// The builder deduplicates Ops with identical arguments, which acts as a simple common
// subexpression eliminator.  That is unsound for two identical loads separated by a
// store: if the loads here were merged, each value would grow by 1 instead of by K.
DEF_TEST(SkVM_dont_dedup_loads, r) {
    constexpr int K = 2;

    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();
        // Emit K separate load/increment/store rounds on the same buffer.
        int rounds = K;
        while (rounds-- > 0) {
            b.store32(buf, b.load32(buf) + 1);
        }
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = { 0,1,2,3,4 };
        const int count = (int)SK_ARRAY_COUNT(buf);

        program.eval(count, buf);

        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, buf[i] == i+K);
        }
    });
}
2739 
// Companion to SkVM_dont_dedup_loads: stores cannot be deduplicated either.  A different
// store sitting between two identical stores overwrites the first one, so the second
// identical store still has to be issued.
DEF_TEST(SkVM_dont_dedup_stores, r) {
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();
        b.store32(buf, b.splat(4));
        b.store32(buf, b.splat(5));
        b.store32(buf, b.splat(4));   // Would be dropped if stores were dedup'd, leaving 5.
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        constexpr int kCount = 42;
        int buf[kCount];

        program.eval(kCount, buf);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, buf[i] == 4);
        }
    });
}
2760 
// Compares fast_mul(0, x) against a regular 0 * x.  fast_mul must produce 0 for every
// input, while the ordinary multiply yields NaN for infinite and NaN inputs.
DEF_TEST(SkVM_fast_mul, r) {
    skvm::Builder b;
    {
        // Declaration order fixes the argument order that eval() expects below.
        skvm::Ptr src  = b.varying<float>();
        skvm::Ptr fast = b.varying<float>();
        skvm::Ptr slow = b.varying<float>();

        skvm::F32 x = b.loadF(src);
        b.storeF(fast, fast_mul(0.0f, x));
        b.storeF(slow, 0.0f * x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        constexpr int kCount = 8;
        const uint32_t bits[kCount] = {
            0x0000'0000, 0x8000'0000, //±0
            0x3f80'0000, 0xbf80'0000, //±1
            0x7f80'0000, 0xff80'0000, //±inf
            0x7f80'0001, 0xff80'0001, //±NaN
        };
        float fast[kCount],
              slow[kCount];
        program.eval(kCount, bits, fast, slow);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, fast[i] == 0.0f);

            // The first four inputs (±0, ±1) are finite; the rest are ±inf and ±NaN.
            const bool finiteInput = (i < 4);
            if (!finiteInput) {
                REPORTER_ASSERT(r, isnan(slow[i]));
            } else {
                REPORTER_ASSERT(r, slow[i] == 0.0f);
            }
        }
    });
}
2793 
// Exercises the builder's createDuplicates mode.  When enabled, finalize() is expected to
// surface Op::duplicate markers and eliminate_dead_code() to strip them; when disabled,
// a finished program must contain no Op::duplicate at all.
DEF_TEST(SkVM_duplicates, reporter) {
    // Emits the same small program into either builder: load red, unpremul against
    // constant g/b/a, then store red back.
    auto emit_unpremul = [](skvm::Builder* builder) {
        skvm::Ptr rptr = builder->varying<int>();

        skvm::F32 r = builder->loadF(rptr);
        skvm::F32 g = builder->splat(0.0f);
        skvm::F32 b = builder->splat(0.0f);
        skvm::F32 a = builder->splat(1.0f);

        builder->unpremul(&r, &g, &b, a);
        builder->storeF(rptr, r);
    };

    {   // Duplicates requested.
        skvm::Builder p(true);
        emit_unpremul(&p);

        std::vector<skvm::Instruction> program = p.program();

        // finalize() should leave at least one duplicate marker in place.
        int duplicates = 0;
        for (const auto& instr : skvm::finalize(program)) {
            duplicates += (instr.op == skvm::Op::duplicate) ? 1 : 0;
        }
        REPORTER_ASSERT(reporter, duplicates > 0);

        // Dead-code elimination should remove every one of them.
        for (const auto& instr : skvm::eliminate_dead_code(program)) {
            REPORTER_ASSERT(reporter, instr.op != skvm::Op::duplicate);
        }
    }

    {   // Duplicates not requested.
        skvm::Builder p(false);
        emit_unpremul(&p);

        auto finished = p.done().instructions();
        for (const auto& instr : finished) {
            REPORTER_ASSERT(reporter, instr.op != skvm::Op::duplicate);
        }
    }
}
2842 
DEF_TEST(SkVM_Visualizer,r)2843 DEF_TEST(SkVM_Visualizer, r) {
2844     const char* src =
2845             "int main(int x, int y) {\n"
2846             "   int a = 99;\n"
2847             "   if (x > 0) a += 100;\n"
2848             "   if (y > 0) a += 101;\n"
2849             "   a = 102;\n"
2850             "   return a;\n"
2851             "}";
2852     GrShaderCaps caps;
2853     SkSL::Compiler compiler(&caps);
2854     SkSL::Program::Settings settings;
2855     auto program = compiler.convertProgram(SkSL::ProgramKind::kGeneric,
2856                                            std::string(src), settings);
2857     const SkSL::FunctionDefinition* main = SkSL::Program_GetFunction(*program, "main");
2858     SkSL::SkVMDebugTrace d;
2859     d.setSource(src);
2860     auto v = std::make_unique<skvm::viz::Visualizer>(&d);
2861     skvm::Builder b(skvm::Features{}, /*createDuplicates=*/true);
2862     SkSL::ProgramToSkVM(*program, *main, &b, &d, /*uniforms=*/{});
2863 
2864     skvm::Program p = b.done(nullptr, true, std::move(v));
2865 #if defined(SKVM_JIT)
2866     SkDynamicMemoryWStream asmFile;
2867     p.disassemble(&asmFile);
2868     auto dumpData = asmFile.detachAsData();
2869     std::string dumpString((const char*)dumpData->data(), dumpData->size());
2870 #else
2871     std::string dumpString;
2872 #endif
2873     SkDynamicMemoryWStream vizFile;
2874     p.visualizer()->dump(&vizFile, dumpString.c_str());
2875     auto vizData = vizFile.detachAsData();
2876     std::string html((const char*)vizData->data(), vizData->size());
2877     //b.dump();
2878     //std::printf(html.c_str());
2879     // Check that html contains all types of information:
2880     if (!dumpString.empty() && !std::strstr(dumpString.c_str(), "Program not JIT'd.")) {
2881         REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='machine'>"));  // machine commands
2882     }
2883     REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='normal'>"));       // SkVM byte code
2884     REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='source'>"));       // C++ source
2885     REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='dead'>"));         // dead code
2886     REPORTER_ASSERT(r, std::strstr(html.c_str(), "<tr class='dead deduped'>")); // deduped removed
2887     REPORTER_ASSERT(r, std::strstr(html.c_str(),                                // deduped origins
2888                        "<tr class='normal origin'>"
2889                        "<td>&#8593;&#8593;&#8593; *13</td>"
2890                        "<td>v2 = splat 0 (0)</td></tr>"));
2891     REPORTER_ASSERT(r, std::strstr(html.c_str(),                                // trace enter
2892                        "<tr class='source'><td class='mask'>&#8618;v9</td>"
2893                                    "<td colspan=2>int main(int x, int y)</td></tr>"));
2894     REPORTER_ASSERT(r, std::strstr(html.c_str(),                                // trace exit
2895                        "<tr class='source'><td class='mask'>&#8617;v9</td>"
2896                        "<td colspan=2>int main(int x, int y)</td></tr>"));
2897 }
2898