• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkColorType.h"
9 #include "include/core/SkData.h"
10 #include "include/core/SkRefCnt.h"
11 #include "include/core/SkScalar.h"
12 #include "include/core/SkSpan.h"
13 #include "include/core/SkStream.h"
14 #include "include/core/SkTypes.h"
15 #include "include/private/SkSLProgramKind.h"
16 #include "include/private/base/SkDebug.h"
17 #include "include/private/base/SkFloatingPoint.h"
18 #include "src/base/SkMSAN.h"
19 #include "src/core/SkVM.h"
20 #include "src/sksl/SkSLCompiler.h"
21 #include "src/sksl/SkSLProgramSettings.h"
22 #include "src/sksl/SkSLUtil.h"
23 #include "src/sksl/codegen/SkSLVMCodeGenerator.h"
24 #include "src/sksl/ir/SkSLFunctionDeclaration.h"
25 #include "src/sksl/ir/SkSLProgram.h"
26 #include "src/sksl/tracing/SkVMDebugTrace.h"
27 #include "src/utils/SkVMVisualizer.h"
28 #include "tests/Test.h"
29 
30 #include <algorithm>
31 #include <cmath>
32 #include <cstdint>
33 #include <cstring>
34 #include <initializer_list>
35 #include <memory>
36 #include <string>
37 #include <utility>
38 #include <vector>
39 
40 template <typename Fn>
test_jit_and_interpreter(const skvm::Builder & b,Fn && test)41 static void test_jit_and_interpreter(const skvm::Builder& b, Fn&& test) {
42     skvm::Program p = b.done();
43     test(p);
44     if (p.hasJIT()) {
45         test(b.done(/*debug_name=*/nullptr, /*allow_jit=*/false));
46     }
47 }
48 
// Builds a program that loads and adds but never stores, then checks that
// skvm::eliminate_dead_code() removes every instruction.
DEF_TEST(SkVM_eliminate_dead_code, r) {
    skvm::Builder b;
    {
        skvm::Ptr ptr      = b.varying<int>();
        skvm::I32 loaded   = b.load32(ptr);
        skvm::I32 doubled  = b.add(loaded, loaded);
        b.add(doubled, b.splat(7));
    }

    // Before elimination: load32, add, splat, add.
    std::vector<skvm::Instruction> insts = b.program();
    REPORTER_ASSERT(r, insts.size() == 4);

    insts = skvm::eliminate_dead_code(insts);
    REPORTER_ASSERT(r, insts.empty());
}
64 
// A program that takes no memory arguments: everything in it should be treated
// as dead code, yet evaluating it must still be possible.
DEF_TEST(SkVM_Pointless, r) {
    skvm::Builder b;
    {
        b.add(b.splat(5.0f),
              b.splat(4.0f));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        // Evaluate with every count from 0 through 63; there are no buffers to pass.
        int count = 0;
        while (count < 64) {
            p.eval(count);
            count++;
        }
    });

    // Every optimized instruction is expected to die immediately and be hoistable.
    for (const skvm::OptimizedInstruction& opt : b.optimize()) {
        REPORTER_ASSERT(r, opt.death == 0 && opt.can_hoist == true);
    }
}
84 
// Stores the constant 42 into a varying int buffer and checks that exactly the
// requested number of elements are written.
DEF_TEST(SkVM_memset, r) {
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        constexpr int kCount = 17;

        // One extra slot past the end acts as a sentinel that must survive untouched.
        int values[kCount + 1];
        values[kCount] = 47;

        program.eval(kCount, values);

        for (int idx = 0; idx < kCount; idx++) {
            REPORTER_ASSERT(r, values[idx] == 42);
        }
        REPORTER_ASSERT(r, values[kCount] == 47);
    });
}
100 
// Copies one varying int buffer into another and checks that only the requested
// prefix is copied.
DEF_TEST(SkVM_memcpy, r) {
    skvm::Builder b;
    {
        auto from = b.varying<int>();
        auto to   = b.varying<int>();
        b.store32(to, b.load32(from));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int input [] = {1,2,3,4,5,6,7,8,9};
        int output[] = {0,0,0,0,0,0,0,0,0};

        // Copy everything except the final element.
        const size_t copied = std::size(input) - 1;
        program.eval(copied, input, output);

        for (size_t k = 0; k < copied; k++) {
            REPORTER_ASSERT(r, output[k] == input[k]);
        }
        // The last element was outside the evaluated range, so it keeps its initial 0.
        REPORTER_ASSERT(r, output[copied] == 0);
    });
}
121 
// When a program built with allow_jit=true ends up with a JIT, building the same
// program with allow_jit=false must not have one.
DEF_TEST(SkVM_allow_jit, r) {
    skvm::Builder b;
    {
        auto from = b.varying<int>();
        auto to   = b.varying<int>();
        b.store32(to, b.load32(from));
    }

    const bool jitAvailable = b.done("test-allow_jit", /*allow_jit=*/true).hasJIT();
    if (jitAvailable) {
        REPORTER_ASSERT(r, !b.done("", false).hasJIT());
    }
}
134 
// Runs a "buf[i] += 1" program for every count N from 0 through the buffer
// length, so each exact N gets covered.
DEF_TEST(SkVM_LoopCounts, r) {
    skvm::Builder b;
    skvm::Ptr ptr = b.varying<int>();
    b.store32(ptr,
              b.add(b.splat(1),
                    b.load32(ptr)));

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int values[64];
        const int kLen = (int)std::size(values);

        for (int count = 0; count <= kLen; count++) {
            // Start each round from values[k] == k.
            for (int k = 0; k < kLen; k++) {
                values[k] = k;
            }

            p.eval(count, values);

            // The first `count` entries were incremented; the remainder are unchanged.
            for (int k = 0; k < kLen; k++) {
                const int want = (k < count) ? k+1 : k;
                REPORTER_ASSERT(r, values[k] == want);
            }
        }
    });
}
162 
// Replaces each buf[i] with a 32-bit gather from the uniform image pointer at
// index (buf[i] & 7).
DEF_TEST(SkVM_gather32, r) {
    skvm::Builder b;
    {
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  ptr      = b.varying<int>();
        skvm::I32  index    = b.load32(ptr);
        b.store32(ptr, b.gather32(uniforms,0, b.bit_and(index, b.splat(7))));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        constexpr int kCount = 20;
        int values[kCount];
        for (int k = 0; k < kCount; k++) {
            values[k] = k;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        p.eval(kCount, &uniforms, values);

        // values[k] started as k, so after masking with 7 the output cycles through img.
        for (int k = 0; k < kCount; k++) {
            REPORTER_ASSERT(r, values[k] == img[k & 7]);
        }
    });
}
210 
DEF_TEST(SkVM_gathers,r)211 DEF_TEST(SkVM_gathers, r) {
212     skvm::Builder b;
213     {
214         skvm::UPtr uniforms = b.uniform();
215         skvm::Ptr buf32    = b.varying<int>(),
216                   buf16    = b.varying<uint16_t>(),
217                   buf8     = b.varying<uint8_t>();
218 
219         skvm::I32 x = b.load32(buf32);
220 
221         b.store32(buf32, b.gather32(uniforms,0, b.bit_and(x, b.splat( 7))));
222         b.store16(buf16, b.gather16(uniforms,0, b.bit_and(x, b.splat(15))));
223         b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
224     }
225 
226     test_jit_and_interpreter(b, [&](const skvm::Program& program) {
227         const int img[] = {12,34,56,78, 90,98,76,54};
228 
229         constexpr int N = 20;
230         int      buf32[N];
231         uint16_t buf16[N];
232         uint8_t  buf8 [N];
233 
234         for (int i = 0; i < 20; i++) {
235             buf32[i] = i;
236         }
237 
238         struct Uniforms {
239             const int* img;
240         } uniforms{img};
241 
242         program.eval(N, &uniforms, buf32, buf16, buf8);
243         int i = 0;
244         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 12); i++;
245         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
246         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
247         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
248         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 56 && buf8[i] == 34); i++;
249         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
250         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 78 && buf8[i] ==  0); i++;
251         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
252 
253         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 90 && buf8[i] == 56); i++;
254         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
255         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 98 && buf8[i] ==  0); i++;
256         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
257         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 76 && buf8[i] == 78); i++;
258         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
259         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 54 && buf8[i] ==  0); i++;
260         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
261 
262         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 90); i++;
263         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
264         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
265         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
266     });
267 }
268 
// Gathers 32-, 16- and 8-bit values from a byte image using unmasked indices
// loaded from the 32-bit buffer.
DEF_TEST(SkVM_gathers2, r) {
    skvm::Builder b;
    {
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  out32    = b.varying<int>();
        skvm::Ptr  out16    = b.varying<uint16_t>();
        skvm::Ptr  out8     = b.varying<uint8_t>();

        skvm::I32 x = b.load32(out32);

        b.store32(out32, b.gather32(uniforms,0, x));
        b.store16(out16, b.gather16(uniforms,0, x));
        b.store8 (out8 , b.gather8 (uniforms,0, x));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        // img[k] == k, so a gathered byte equals the index used to fetch it.
        uint8_t img[256];
        for (int k = 0; k < 256; k++) {
            img[k] = k;
        }

        constexpr int kCount = 64;
        int      got32[kCount];
        uint16_t got16[kCount];
        uint8_t  got8 [kCount];

        for (int k = 0; k < kCount; k++) {
            got32[k] = (k*47)&63;   // 0,47,30,13,60,...
            got16[k] = 0;
            got8 [k] = 0;
        }

        struct Uniforms {
            const uint8_t* img;
        } uniforms{img};

        p.eval(kCount, &uniforms, got32, got16, got8);

        for (int k = 0; k < kCount; k++) {
            REPORTER_ASSERT(r, got8[k] == ((k*47)&63));
        }

        // Spot-check the first and last lanes of the wider gathers.
        REPORTER_ASSERT(r, got16[ 0] == 0x0100);
        REPORTER_ASSERT(r, got16[63] == 0x2322);

        REPORTER_ASSERT(r, got32[ 0] == 0x03020100);
        REPORTER_ASSERT(r, got32[63] == 0x47464544);
    });
}
317 
// Chains the bitwise and shift ops on a single value; the trailing comments give
// the running value for the input 0x42 used below.
DEF_TEST(SkVM_bitops, r) {
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();

        skvm::I32 v = b.load32(buf);

        // Masking ops.
        v = b.bit_and  (v, b.splat(0xf1));  // 0x42 -> 0x40
        v = b.bit_or   (v, b.splat(0x80));  //      -> 0xc0
        v = b.bit_xor  (v, b.splat(0xfe));  //      -> 0x3e
        v = b.bit_clear(v, b.splat(0x30));  //      -> 0x0e

        // Shifts: left, arithmetic right, logical right.
        v = b.shl(v, 28);  // -> 0xe000'0000
        v = b.sra(v, 28);  // -> 0xffff'fffe
        v = b.shr(v,  1);  // -> 0x7fff'ffff

        b.store32(buf, v);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int value = 0x42;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 0x7fff'ffff);
    });
}
343 
// select(is_NaN(x), 0, x) should optimize down to four instructions, and should
// zero out NaN inputs while passing everything else through bit-for-bit.
DEF_TEST(SkVM_select_is_NaN, r) {
    skvm::Builder b;
    {
        skvm::Ptr in  = b.varying<float>();
        skvm::Ptr out = b.varying<float>();

        skvm::F32 x = b.loadF(in);
        x = select(is_NaN(x), b.splat(0.0f)
                            , x);
        b.storeF(out, x);
    }

    // The optimized program is expected to be exactly these ops, in this order.
    const skvm::Op wantOps[] = {
        skvm::Op::load32,
        skvm::Op::neq_f32,
        skvm::Op::bit_clear,
        skvm::Op::store32,
    };
    std::vector<skvm::OptimizedInstruction> optimized = b.optimize();
    REPORTER_ASSERT(r, optimized.size() == 4);
    REPORTER_ASSERT(r, optimized[0].op == wantOps[0]);
    REPORTER_ASSERT(r, optimized[1].op == wantOps[1]);
    REPORTER_ASSERT(r, optimized[2].op == wantOps[2]);
    REPORTER_ASSERT(r, optimized[3].op == wantOps[3]);

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        // Bit patterns for ±NaN, ±0, ±1, ±inf.
        uint32_t in[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                         0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        uint32_t out[std::size(in)];
        p.eval(std::size(in), in, out);

        // Only the first two inputs are NaN; those become 0.
        for (int k = 0; k < (int)std::size(in); k++) {
            const uint32_t want = (k < 2) ? 0 : in[k];
            REPORTER_ASSERT(r, out[k] == want);
        }
    });
}
375 
// Basic float arithmetic: ((x+x) - x) / x should come out as 1 for every
// non-zero input used here.
DEF_TEST(SkVM_f32, r) {
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<float>();

        skvm::F32 x       = b.loadF(buf);
        skvm::F32 twice   = b.add(x, x);          // 2x
        skvm::F32 same    = b.sub(twice, x);      // 2x - x = x
        skvm::F32 unity   = b.div(same, x);       // x / x = 1
        b.storeF(buf, unity);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        float values[] = { 1,2,3,4,5,6,7,8,9 };
        p.eval(std::size(values), values);
        for (int k = 0; k < (int)std::size(values); k++) {
            REPORTER_ASSERT(r, values[k] == 1.0f);
        }
    });
}
396 
// Packs the results of the six integer comparisons into bits 0..5 of the output:
//   bit 0: x == 0    bit 1: x != 1    bit 2: x <  2
//   bit 3: x <= 3    bit 4: x >  4    bit 5: x >= 5
DEF_TEST(SkVM_cmp_i32, r) {
    skvm::Builder b;
    {
        skvm::I32 x = b.load32(b.varying<int>());

        // Reduce a comparison mask to a single bit and move it to position `shift`.
        auto bit_at = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, bit_at(0, b. eq(x, b.splat(0))));
        bits = b.bit_or(bits, bit_at(1, b.neq(x, b.splat(1))));
        bits = b.bit_or(bits, bit_at(2, b. lt(x, b.splat(2))));
        bits = b.bit_or(bits, bit_at(3, b.lte(x, b.splat(3))));
        bits = b.bit_or(bits, bit_at(4, b. gt(x, b.splat(4))));
        bits = b.bit_or(bits, bit_at(5, b.gte(x, b.splat(5))));

        b.store32(b.varying<int>(), bits);
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int input[] = { 0,1,2,3,4,5,6,7,8,9 };
        int output[std::size(input)];

        p.eval(std::size(input), input, output);

        // Inputs 0..4 each have a distinct pattern; 5 and up all look the same.
        const int wantLow[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        for (int k = 0; k < (int)std::size(output); k++) {
            const int want = (k < 5) ? wantLow[k] : 0b110010;
            REPORTER_ASSERT(r, output[k] == want);
        }
    });
}
432 
// Float counterpart of the integer comparison test: the six comparison results
// are packed into bits 0..5 of the integer output:
//   bit 0: x == 0    bit 1: x != 1    bit 2: x <  2
//   bit 3: x <= 3    bit 4: x >  4    bit 5: x >= 5
DEF_TEST(SkVM_cmp_f32, r) {
    skvm::Builder b;
    {
        skvm::F32 x = b.loadF(b.varying<float>());

        // Reduce a comparison mask to a single bit and move it to position `shift`.
        auto bit_at = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, bit_at(0, b. eq(x, b.splat(0.0f))));
        bits = b.bit_or(bits, bit_at(1, b.neq(x, b.splat(1.0f))));
        bits = b.bit_or(bits, bit_at(2, b. lt(x, b.splat(2.0f))));
        bits = b.bit_or(bits, bit_at(3, b.lte(x, b.splat(3.0f))));
        bits = b.bit_or(bits, bit_at(4, b. gt(x, b.splat(4.0f))));
        bits = b.bit_or(bits, bit_at(5, b.gte(x, b.splat(5.0f))));

        b.store32(b.varying<int>(), bits);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        float input[] = { 0,1,2,3,4,5,6,7,8,9 };
        int output[std::size(input)];

        p.eval(std::size(input), input, output);

        // Inputs 0..4 each have a distinct pattern; 5 and up all look the same.
        const int wantLow[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        for (int k = 0; k < (int)std::size(output); k++) {
            const int want = (k < 5) ? wantLow[k] : 0b110010;
            REPORTER_ASSERT(r, output[k] == want);
        }
    });
}
469 
// Stores b.index() into each lane.  The expectation below is that lane i holds
// the number of lanes remaining, i.e. it counts down from N.
DEF_TEST(SkVM_index, r) {
    skvm::Builder b;
    b.store32(b.varying<int>(), b.index());

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int lanes[23];
        const int kLanes = (int)std::size(lanes);

        p.eval(std::size(lanes), lanes);

        for (int k = 0; k < kLanes; k++) {
            REPORTER_ASSERT(r, lanes[k] == kLanes-k);
        }
    });
}
482 
// A chain of mad() calls arranged to hit the awkward cases of instruction and
// register selection for Op::mad_f32.
DEF_TEST(SkVM_mad, r) {
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(buf));
        skvm::F32 y = b.mad(x,x,x);   // x is still needed later, so r[x] != r[y].
        skvm::F32 z = b.mad(y,y,x);   // y is still needed later, but r[z] = r[x] is fine.
        skvm::F32 w = b.mad(z,z,y);   // w may alias z, but not y.
        skvm::F32 v = b.mad(w,y,w);   // The chain has to end somewhere.
        b.store32(buf, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        // Starting from x = 2:
        //   y = 2*2 + 2       = 6
        //   z = 6*6 + 2       = 38
        //   w = 38*38 + 6     = 1450
        //   v = 1450*6 + 1450 = 10150
        int value = 2;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 10150);
    });
}
510 
// Builds trunc(x*2 - 1), a pattern that can be peepholed into an Op::fms_f32,
// and verifies the result for the inputs 0..10.
DEF_TEST(SkVM_fms, r) {
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.mul(x, b.splat(2.0f)),
                            b.splat(1.0f));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        program.eval((int)std::size(buf), buf);

        for (int i = 0; i < (int)std::size(buf); i++) {
            // This has to be a comparison.  An assignment here would overwrite buf[i]
            // with the (always odd, hence non-zero) expected value and could never fail.
            REPORTER_ASSERT(r, buf[i] == 2*i-1);
        }
    });
}
532 
// Builds trunc(1 - x*2), a pattern that can be peepholed into an Op::fnma_f32,
// and verifies the result for the inputs 0..10.
DEF_TEST(SkVM_fnma, r) {
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.splat(1.0f),
                            b.mul(x, b.splat(2.0f)));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        program.eval((int)std::size(buf), buf);

        for (int i = 0; i < (int)std::size(buf); i++) {
            // This has to be a comparison.  An assignment here would overwrite buf[i]
            // with the (always odd, hence non-zero) expected value and could never fail.
            REPORTER_ASSERT(r, buf[i] == 1-2*i);
        }
    });
}
554 
// Another mad() chain, this time operating directly on floats, with different
// register lifetime constraints between the intermediate values.
DEF_TEST(SkVM_madder, r) {
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<float>();

        skvm::F32 x = b.loadF(buf);
        skvm::F32 y = b.mad(x,x,x);   // x is still needed later, so r[x] != r[y].
        skvm::F32 z = b.mad(y,x,y);   // After this r[x] is free for reuse; r[y] is not.
        skvm::F32 w = b.mad(y,y,z);
        b.storeF(buf, w);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        // Starting from x = 2:
        //   y = 2*2 + 2  = 6
        //   z = 6*2 + 6  = 18
        //   w = 6*6 + 18 = 54
        float value = 2.0f;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 54.0f);
    });
}
576 
// floor() applied in place to a few negative, zero and positive values.
DEF_TEST(SkVM_floor, r) {
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<float>();
        b.storeF(buf, b.floor(b.loadF(buf)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        float       values[]   = { -2.0f, -1.5f, -1.0f, 0.0f, 1.0f, 1.5f, 2.0f };
        const float expected[] = { -2.0f, -2.0f, -1.0f, 0.0f, 1.0f, 1.0f, 2.0f };

        p.eval(std::size(values), values);

        const int count = (int)std::size(values);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, values[k] == expected[k]);
        }
    });
}
593 
// round() from float to int.
//
// The inputs that sit exactly on a .5 boundary assume the current rounding mode
// is round-to-nearest-even.  Nothing here explicitly guarantees that; it is just
// probably the case.
DEF_TEST(SkVM_round, r) {
    skvm::Builder b;
    {
        skvm::Ptr in  = b.varying<float>();
        skvm::Ptr out = b.varying<int>();
        b.store32(out, b.round(b.loadF(in)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        float     input[]    = { -1.5f, -0.5f, 0.0f, 0.5f, 0.2f, 0.6f, 1.0f, 1.4f, 1.5f, 2.0f };
        const int expected[] = { -2   ,  0   , 0   , 0   , 0   , 1   , 1   , 1   , 2   , 2    };
        int       output[std::size(input)];

        p.eval(std::size(input), input, output);

        const int count = (int)std::size(output);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, output[k] == expected[k]);
        }
    });
}
615 
// Lane-wise float min() of two input buffers.
DEF_TEST(SkVM_min, r) {
    skvm::Builder b;
    {
        skvm::Ptr lhs = b.varying<float>();
        skvm::Ptr rhs = b.varying<float>();
        skvm::Ptr out = b.varying<float>();

        b.storeF(out, b.min(b.loadF(lhs), b.loadF(rhs)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        float       left[]     = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float       right[]    = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        const float expected[] = { 0.0f, 1.0f, 3.0f, -1.0f, -2.0f};
        float       result[std::size(left)];

        p.eval(std::size(result), left, right, result);

        const int count = (int)std::size(result);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, result[k] == expected[k]);
        }
    });
}
637 
// Lane-wise float max() of two input buffers.
DEF_TEST(SkVM_max, r) {
    skvm::Builder b;
    {
        skvm::Ptr lhs = b.varying<float>();
        skvm::Ptr rhs = b.varying<float>();
        skvm::Ptr out = b.varying<float>();

        b.storeF(out, b.max(b.loadF(lhs), b.loadF(rhs)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        float       left[]     = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float       right[]    = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        const float expected[] = { 0.0f, 2.0f, 4.0f,  1.0f, -1.0f};
        float       result[std::size(left)];

        p.eval(std::size(result), left, right, result);

        const int count = (int)std::size(result);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, result[k] == expected[k]);
        }
    });
}
659 
// Uses so many constants that JITting with them all hoisted will fail.  The JIT
// is then expected to retry without hoisting, which only needs 2 registers.
DEF_TEST(SkVM_hoist, r) {
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();
        skvm::I32 sum = b.load32(buf);
        for (int k = 0; k < 32; k++) {
            sum = b.add(sum, b.splat(k));
        }
        b.store32(buf, sum);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        // 0 + 1 + 2 + ... + 31 == 496, added to the starting value 4.
        int value = 4;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 500);
    });
}
681 
// select(): values greater than 4 are kept, all others are replaced with 42.
DEF_TEST(SkVM_select, r) {
    skvm::Builder b;
    {
        skvm::Ptr ptr = b.varying<int>();

        skvm::I32 v = b.load32(ptr);

        v = b.select( b.gt(v, b.splat(4)), v, b.splat(42) );

        b.store32(ptr, v);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int values[] = { 0,1,2,3,4,5,6,7,8 };
        p.eval(std::size(values), values);

        const int count = (int)std::size(values);
        for (int k = 0; k < count; k++) {
            const int want = (k > 4) ? k : 42;
            REPORTER_ASSERT(r, values[k] == want);
        }
    });
}
702 
// Swaps the contents of two buffers.
//
// The program is equivalent to
//     x = *X
//     y = *Y
//     *X = y
//     *Y = x
// Rescheduling it purely by the data flow of Op arguments could produce
//     x = *X
//     *Y = x
//     y = *Y
//     *X = y
// which gives different results and is therefore an invalid reordering.
DEF_TEST(SkVM_swap, r) {
    skvm::Builder b;
    {
        skvm::Ptr X = b.varying<int>();
        skvm::Ptr Y = b.varying<int>();

        skvm::I32 x = b.load32(X);
        skvm::I32 y = b.load32(Y);

        b.store32(X, y);
        b.store32(Y, x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int first [] = { 0,1,2,3 };
        int second[] = { 4,5,6,7 };

        p.eval(std::size(first), first, second);

        // After the swap, `first` holds 4..7 and `second` holds 0..3.
        const int count = (int)std::size(first);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, first[k] == 4 + k);
            REPORTER_ASSERT(r, second[k] == k);
        }
    });
}
737 
// Exercises a somewhat arbitrary mix of ops: 16-bit load/store, uniform32,
// integer math, select-based clamping, and gather8.
DEF_TEST(SkVM_NewOps, r) {
    skvm::Builder b;
    {
        skvm::Ptr  ptr      = b.varying<int16_t>();
        skvm::UPtr uniforms = b.uniform();

        skvm::I32 v = b.load16(ptr);

        // The int uniforms are read at offsets just past a pointer-sized first field.
        const size_t kBase = sizeof(const int*);

        v = b.add(v, b.uniform32(uniforms, kBase+0));
        v = b.mul(v, b.uniform32(uniforms, kBase+4));
        v = b.sub(v, b.uniform32(uniforms, kBase+8));

        // Pin v to [0, limit].
        skvm::I32 limit = b.uniform32(uniforms, kBase+12);
        v = b.select(b.lt(v, b.splat(0)), b.splat(0), v);
        v = b.select(b.gt(v, limit     ), limit     , v);

        v = b.gather8(uniforms,0, v);

        b.store16(ptr, v);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        const int N = 31;
        int16_t values[N];
        for (int k = 0; k < N; k++) {
            values[k] = k;
        }

        const int M = 16;
        uint8_t img[M];
        for (int k = 0; k < M; k++) {
            img[k] = k*k;
        }

        struct {
            const uint8_t* img;
            int      add   = 5;
            int      mul   = 3;
            int      sub   = 18;
            int      limit = M-1;
        } uniforms{img};

        p.eval(N, values, &uniforms);

        for (int k = 0; k < N; k++) {
            // The math computes (k+5)*3 - 18, a.k.a. 3*(k-1), which is then pinned to
            // the valid indices of img.  k == 1 lands on 0 exactly and k == 6 lands on
            // 15 exactly.
            const int index = std::clamp(3*(k-1), 0, M-1);
            REPORTER_ASSERT(r, values[k] == img[index]);
        }
    });
}
796 
// Reads int and float arrays that were pushed into skvm::Uniforms via pushArray()
// and pushArrayF(), then re-runs after modifying the source arrays.
DEF_TEST(SKVM_array32, r) {
    skvm::Builder b;
    skvm::Uniforms uniforms(b.uniform(), 0);
    // Occupy the first slot so the uniforms that follow are not at offset 0.
    uniforms.push(0);
    int ints[] = {3, 7};
    skvm::Uniform intArray = uniforms.pushArray(ints);
    float floats[] = {5, 9};
    skvm::Uniform floatArray = uniforms.pushArrayF(floats);
    {
        skvm::Ptr out0 = b.varying<int32_t>();
        skvm::Ptr out1 = b.varying<int32_t>();
        skvm::Ptr out2 = b.varying<int32_t>();

        skvm::I32 first = b.array32(intArray, 0);
        b.store32(out0, first);
        skvm::I32 second = b.array32(intArray, 1);
        b.store32(out1, second);

        skvm::F32 x = b.arrayF(floatArray, 0);
        skvm::F32 y = b.arrayF(floatArray, 1);
        b.store32(out2, b.trunc(b.add(x, y)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        const int K = 10;
        int32_t got0[K],
                got1[K],
                got2[K];

        // Evaluate, then require every lane of each output to equal the given value.
        auto run_and_expect = [&](int want0, int want1, int want2) {
            p.eval(K, uniforms.buf.data(), got0, got1, got2);
            for (int32_t v : got0) {
                REPORTER_ASSERT(r, v == want0);
            }
            for (int32_t v : got1) {
                REPORTER_ASSERT(r, v == want1);
            }
            for (int32_t v : got2) {
                REPORTER_ASSERT(r, v == want2);
            }
        };

        // This lambda may run twice, and the second half below modifies the arrays,
        // so restore the starting values first.
        ints[0]   = 3;
        floats[1] = 9;
        run_and_expect(3, 7, 14);

        // Change the source arrays and evaluate again with the same uniform buffer.
        ints[0]   = 4;
        floats[1] = 10;
        run_and_expect(4, 7, 15);
    });
}
857 
// sqrt() applied in place to perfect squares, which should give back exact
// integers.
DEF_TEST(SkVM_sqrt, r) {
    skvm::Builder b;
    auto ptr = b.varying<int>();
    b.storeF(ptr, b.sqrt(b.loadF(ptr)));

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        constexpr int K = 17;

        // Fill with k^2 ...
        float values[K];
        for (int k = 0; k < K; k++) {
            values[k] = (float)(k*k);
        }

        p.eval(K, values);

        // ... and expect k back.
        for (int k = 0; k < K; k++) {
            REPORTER_ASSERT(r, values[k] == (float)k);
        }
    });
}
878 
// This small memset32()-style program should be JITtable, but in an MSAN build
// JIT code would not be seen initializing the buffer.  So this checks that the
// interpreter is used instead in that configuration.
DEF_TEST(SkVM_MSAN, r) {
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        constexpr int K = 17;
        int values[K];   // Deliberately left uninitialized; the program must fill it.

        p.eval(K, values);

        sk_msan_assert_initialized(values, values+K);
        for (int k = 0; k < K; k++) {
            REPORTER_ASSERT(r, values[k] == 42);
        }
    });
}
896 
// Runs a program containing assert_true() over inputs that all satisfy the
// asserted condition (every value is less than 42).
DEF_TEST(SkVM_assert, r) {
    skvm::Builder b;
    b.assert_true(b.lt(b.load32(b.varying<int>()),
                       b.splat(42)));

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int values[] = { 0,1,2,3,4,5,6,7,8,9 };
        p.eval(std::size(values), values);
    });
}
907 
DEF_TEST(SkVM_trace_line, r) {
    // Records the argument of every line() callback.  The remaining callbacks are
    // not exercised here; they log a sentinel that cannot match the expected output.
    struct LineRecorder : public skvm::TraceHook {
        void line(int lineNum) override { fSeen.push_back(lineNum); }

        void var(int, int32_t) override { this->unexpected(); }
        void enter(int) override        { this->unexpected(); }
        void exit(int) override         { this->unexpected(); }
        void scope(int) override        { this->unexpected(); }

        void unexpected() { fSeen.push_back(-9999999); }

        std::vector<int> fSeen;
    };

    // Typed exactly like the literals they stand for so overload resolution is unchanged.
    constexpr unsigned kOn  = 0xFFFFFFFF;
    constexpr int      kOff = 0x00000000;

    skvm::Builder b;
    LineRecorder recorder;
    const int hook = b.attachTraceHook(&recorder);

    // Only the calls with both masks enabled are expected to reach the hook.
    b.trace_line(hook, b.splat(kOn),  b.splat(kOn),  123);
    b.trace_line(hook, b.splat(kOff), b.splat(kOn),  456);
    b.trace_line(hook, b.splat(kOn),  b.splat(kOff), 567);
    b.trace_line(hook, b.splat(kOff), b.splat(kOff), 678);
    b.trace_line(hook, b.splat(kOn),  b.splat(kOn),  789);

    skvm::Program program = b.done();
    program.eval(1);

    const std::vector<int> expected = {123, 789};
    REPORTER_ASSERT(r, recorder.fSeen == expected);
}
933 
DEF_TEST(SkVM_trace_var, r) {
    // Records each var() callback as a (slot, value) pair.  The remaining callbacks
    // are not exercised here; they log a sentinel that cannot match the expected output.
    struct VarRecorder : public skvm::TraceHook {
        void var(int slot, int32_t val) override {
            fSeen.push_back(slot);
            fSeen.push_back(val);
        }

        void line(int) override  { this->unexpected(); }
        void enter(int) override { this->unexpected(); }
        void exit(int) override  { this->unexpected(); }
        void scope(int) override { this->unexpected(); }

        void unexpected() { fSeen.push_back(-9999999); }

        std::vector<int> fSeen;
    };

    // Typed exactly like the literals they stand for so overload resolution is unchanged.
    constexpr unsigned kOn  = 0xFFFFFFFF;
    constexpr int      kOff = 0x00000000;

    skvm::Builder b;
    VarRecorder recorder;
    const int hook = b.attachTraceHook(&recorder);

    // Only the calls with both masks enabled are expected to reach the hook.
    b.trace_var(hook, b.splat(kOff), b.splat(kOn),  2, b.splat(333));
    b.trace_var(hook, b.splat(kOn),  b.splat(kOn),  4, b.splat(555));
    b.trace_var(hook, b.splat(kOff), b.splat(kOff), 5, b.splat(666));
    b.trace_var(hook, b.splat(kOn),  b.splat(kOn),  6, b.splat(777));
    b.trace_var(hook, b.splat(kOn),  b.splat(kOff), 8, b.splat(999));

    skvm::Program program = b.done();
    program.eval(1);

    const std::vector<int> expected = {4, 555, 6, 777};
    REPORTER_ASSERT(r, recorder.fSeen == expected);
}
962 
DEF_TEST(SkVM_trace_enter_exit, r) {
    // Logs enter() as {fnIdx, 1} and exit() as {fnIdx, 0}.  The remaining callbacks
    // are not exercised here; they log a sentinel that cannot match the expected output.
    struct CallRecorder : public skvm::TraceHook {
        void enter(int fnIdx) override { this->record(fnIdx, 1); }
        void exit(int fnIdx) override  { this->record(fnIdx, 0); }

        void line(int) override         { this->unexpected(); }
        void var(int, int32_t) override { this->unexpected(); }
        void scope(int) override        { this->unexpected(); }

        void record(int fnIdx, int entering) {
            fLog.push_back(fnIdx);
            fLog.push_back(entering);
        }
        void unexpected() { fLog.push_back(-9999999); }

        std::vector<int> fLog;
    };

    // Typed exactly like the literals they stand for so overload resolution is unchanged.
    constexpr unsigned kOn  = 0xFFFFFFFF;
    constexpr int      kOff = 0x00000000;

    skvm::Builder b;
    CallRecorder recorder;
    const int hook = b.attachTraceHook(&recorder);

    // Only the calls with both masks enabled are expected to reach the hook.
    b.trace_enter(hook, b.splat(kOff), b.splat(kOff), 99);
    b.trace_enter(hook, b.splat(kOn),  b.splat(kOn),  12);
    b.trace_enter(hook, b.splat(kOff), b.splat(kOn),  34);
    b.trace_exit(hook, b.splat(kOn),  b.splat(kOn),  56);
    b.trace_exit(hook, b.splat(kOn),  b.splat(kOff), 78);
    b.trace_exit(hook, b.splat(kOff), b.splat(kOff), 90);

    skvm::Program program = b.done();
    program.eval(1);

    const std::vector<int> expected = {12, 1, 56, 0};
    REPORTER_ASSERT(r, recorder.fLog == expected);
}
995 
DEF_TEST(SkVM_trace_scope, r) {
    // Records the delta passed to every scope() callback.  The remaining callbacks
    // are not exercised here; they log a sentinel that cannot match the expected output.
    struct ScopeRecorder : public skvm::TraceHook {
        void scope(int delta) override { fSeen.push_back(delta); }

        void var(int, int32_t) override { this->unexpected(); }
        void enter(int) override        { this->unexpected(); }
        void exit(int) override         { this->unexpected(); }
        void line(int) override         { this->unexpected(); }

        void unexpected() { fSeen.push_back(-9999999); }

        std::vector<int> fSeen;
    };

    // Typed exactly like the literals they stand for so overload resolution is unchanged.
    constexpr unsigned kOn  = 0xFFFFFFFF;
    constexpr int      kOff = 0x00000000;

    skvm::Builder b;
    ScopeRecorder recorder;
    const int hook = b.attachTraceHook(&recorder);

    // Only the calls with both masks enabled are expected to reach the hook.
    b.trace_scope(hook, b.splat(kOn),  b.splat(kOn),   1);
    b.trace_scope(hook, b.splat(kOn),  b.splat(kOff), -2);
    b.trace_scope(hook, b.splat(kOff), b.splat(kOff),  3);
    b.trace_scope(hook, b.splat(kOff), b.splat(kOn),   4);
    b.trace_scope(hook, b.splat(kOn),  b.splat(kOn),  -5);

    skvm::Program program = b.done();
    program.eval(1);

    const std::vector<int> expected = {1, -5};
    REPORTER_ASSERT(r, recorder.fSeen == expected);
}
1021 
DEF_TEST(SkVM_trace_multiple_hooks, r) {
    // Records the argument of every line() callback.  The remaining callbacks are
    // not exercised here; they log a sentinel that cannot match the expected output.
    struct LineRecorder : public skvm::TraceHook {
        void line(int lineNum) override { fSeen.push_back(lineNum); }

        void var(int, int32_t) override { this->unexpected(); }
        void enter(int) override        { this->unexpected(); }
        void exit(int) override         { this->unexpected(); }
        void scope(int) override        { this->unexpected(); }

        void unexpected() { fSeen.push_back(-9999999); }

        std::vector<int> fSeen;
    };

    // Typed exactly like the literals they stand for so overload resolution is unchanged.
    constexpr unsigned kOn  = 0xFFFFFFFF;
    constexpr int      kOff = 0x00000000;

    // Three independent hooks attached to one builder, in A, B, C order.
    skvm::Builder b;
    LineRecorder recorderA;
    LineRecorder recorderB;
    LineRecorder recorderC;
    const int hookA = b.attachTraceHook(&recorderA);
    const int hookB = b.attachTraceHook(&recorderB);
    const int hookC = b.attachTraceHook(&recorderC);

    // Each hook gets exactly one call with both masks enabled.
    b.trace_line(hookC, b.splat(kOn),  b.splat(kOn),  111);
    b.trace_line(hookA, b.splat(kOn),  b.splat(kOn),  222);
    b.trace_line(hookC, b.splat(kOff), b.splat(kOff), 333);
    b.trace_line(hookB, b.splat(kOn),  b.splat(kOff), 444);
    b.trace_line(hookA, b.splat(kOff), b.splat(kOn),  555);
    b.trace_line(hookB, b.splat(kOn),  b.splat(kOn),  666);

    skvm::Program program = b.done();
    program.eval(1);

    // Each recorder must have seen only the line routed to its own hook ID.
    const std::vector<int> expectedA = {222};
    const std::vector<int> expectedB = {666};
    const std::vector<int> expectedC = {111};
    REPORTER_ASSERT(r, recorderA.fSeen == expectedA);
    REPORTER_ASSERT(r, recorderB.fSeen == expectedB);
    REPORTER_ASSERT(r, recorderC.fSeen == expectedC);
}
1052 
DEF_TEST(SkVM_premul,reporter)1053 DEF_TEST(SkVM_premul, reporter) {
1054     // Test that premul is short-circuited when alpha is known opaque.
1055     {
1056         skvm::Builder p;
1057         auto rptr = p.varying<int>(),
1058              aptr = p.varying<int>();
1059 
1060         skvm::F32 r = p.loadF(rptr),
1061                   g = p.splat(0.0f),
1062                   b = p.splat(0.0f),
1063                   a = p.loadF(aptr);
1064 
1065         p.premul(&r, &g, &b, a);
1066         p.storeF(rptr, r);
1067 
1068         // load red, load alpha, red *= alpha, store red
1069         REPORTER_ASSERT(reporter, p.done().instructions().size() == 4);
1070     }
1071 
1072     {
1073         skvm::Builder p;
1074         auto rptr = p.varying<int>();
1075 
1076         skvm::F32 r = p.loadF(rptr),
1077                   g = p.splat(0.0f),
1078                   b = p.splat(0.0f),
1079                   a = p.splat(1.0f);
1080 
1081         p.premul(&r, &g, &b, a);
1082         p.storeF(rptr, r);
1083 
1084         // load red, store red
1085         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
1086     }
1087 
1088     // Same deal for unpremul.
1089     {
1090         skvm::Builder p;
1091         auto rptr = p.varying<int>(),
1092              aptr = p.varying<int>();
1093 
1094         skvm::F32 r = p.loadF(rptr),
1095                   g = p.splat(0.0f),
1096                   b = p.splat(0.0f),
1097                   a = p.loadF(aptr);
1098 
1099         p.unpremul(&r, &g, &b, a);
1100         p.storeF(rptr, r);
1101 
1102         // load red, load alpha, a bunch of unpremul instructions, store red
1103         REPORTER_ASSERT(reporter, p.done().instructions().size() >= 4);
1104     }
1105 
1106     {
1107         skvm::Builder p;
1108         auto rptr = p.varying<int>();
1109 
1110         skvm::F32 r = p.loadF(rptr),
1111                   g = p.splat(0.0f),
1112                   b = p.splat(0.0f),
1113                   a = p.splat(1.0f);
1114 
1115         p.unpremul(&r, &g, &b, a);
1116         p.storeF(rptr, r);
1117 
1118         // load red, store red
1119         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
1120     }
1121 }
1122 
1123 template <typename Fn>
test_asm(skiatest::Reporter * r,Fn && fn,std::initializer_list<uint8_t> expected)1124 static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
1125     uint8_t buf[4096];
1126     skvm::Assembler a{buf};
1127     fn(a);
1128 
1129     REPORTER_ASSERT(r, a.size() == expected.size());
1130 
1131     auto got = (const uint8_t*)buf,
1132          want = expected.begin();
1133     for (int i = 0; i < (int)std::min(a.size(), expected.size()); i++) {
1134         REPORTER_ASSERT(r, got[i] == want[i],
1135                         "byte %d was %02x, want %02x", i, got[i], want[i]);
1136     }
1137 }
1138 
DEF_TEST(SkVM_Assembler,r)1139 DEF_TEST(SkVM_Assembler, r) {
1140     // Easiest way to generate test cases is
1141     //
1142     //   echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
1143     //
1144     // The -x86-asm-syntax=intel bit is optional, controlling the
1145     // input syntax only; the output will always be AT&T  op x,y,dst style.
1146     // Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
1147     // that a bit easier to use here, despite maybe favoring AT&T overall.
1148 
1149     using A = skvm::Assembler;
1150     // Our exit strategy from AVX code.
1151     test_asm(r, [&](A& a) {
1152         a.int3();
1153         a.vzeroupper();
1154         a.ret();
1155     },{
1156         0xcc,
1157         0xc5, 0xf8, 0x77,
1158         0xc3,
1159     });
1160 
1161     // Align should pad with zero
1162     test_asm(r, [&](A& a) {
1163         a.ret();
1164         a.align(4);
1165     },{
1166         0xc3,
1167         0x00, 0x00, 0x00,
1168     });
1169 
1170     test_asm(r, [&](A& a) {
1171         a.add(A::rax, 8);       // Always good to test rax.
1172         a.sub(A::rax, 32);
1173 
1174         a.add(A::rdi, 12);      // Last 0x48 REX
1175         a.sub(A::rdi, 8);
1176 
1177         a.add(A::r8 , 7);       // First 0x49 REX
1178         a.sub(A::r8 , 4);
1179 
1180         a.add(A::rsi, 128);     // Requires 4 byte immediate.
1181         a.sub(A::r8 , 1000000);
1182 
1183         a.add(A::Mem{A::rsi}, 7);                       // addq $7, (%rsi)
1184         a.add(A::Mem{A::rsi, 12}, 7);                   // addq $7, 12(%rsi)
1185         a.add(A::Mem{A::rsp, 12}, 7);                   // addq $7, 12(%rsp)
1186         a.add(A::Mem{A::r12, 12}, 7);                   // addq $7, 12(%r12)
1187         a.add(A::Mem{A::rsp, 12, A::rax, A::FOUR}, 7);  // addq $7, 12(%rsp,%rax,4)
1188         a.add(A::Mem{A::r12, 12, A::rax, A::FOUR}, 7);  // addq $7, 12(%r12,%rax,4)
1189         a.add(A::Mem{A::rax, 12, A::r12, A::FOUR}, 7);  // addq $7, 12(%rax,%r12,4)
1190         a.add(A::Mem{A::r11, 12, A::r8 , A::TWO }, 7);  // addq $7, 12(%r11,%r8,2)
1191         a.add(A::Mem{A::r11, 12, A::rax}         , 7);  // addq $7, 12(%r11,%rax)
1192         a.add(A::Mem{A::rax, 12, A::r11}         , 7);  // addq $7, 12(%rax,%r11)
1193 
1194         a.sub(A::Mem{A::rax, 12, A::r11}         , 7);  // subq $7, 12(%rax,%r11)
1195 
1196         a.add(       A::rax     , A::rcx);              // addq %rcx, %rax
1197         a.add(A::Mem{A::rax}    , A::rcx);              // addq %rcx, (%rax)
1198         a.add(A::Mem{A::rax, 12}, A::rcx);              // addq %rcx, 12(%rax)
1199         a.add(A::rcx, A::Mem{A::rax, 12});              // addq 12(%rax), %rcx
1200 
1201         a.sub(A::rcx, A::Mem{A::rax, 12});              // subq 12(%rax), %rcx
1202     },{
1203         0x48, 0x83, 0b11'000'000, 0x08,
1204         0x48, 0x83, 0b11'101'000, 0x20,
1205 
1206         0x48, 0x83, 0b11'000'111, 0x0c,
1207         0x48, 0x83, 0b11'101'111, 0x08,
1208 
1209         0x49, 0x83, 0b11'000'000, 0x07,
1210         0x49, 0x83, 0b11'101'000, 0x04,
1211 
1212         0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
1213         0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
1214 
1215         0x48,0x83,0x06,0x07,
1216         0x48,0x83,0x46,0x0c,0x07,
1217         0x48,0x83,0x44,0x24,0x0c,0x07,
1218         0x49,0x83,0x44,0x24,0x0c,0x07,
1219         0x48,0x83,0x44,0x84,0x0c,0x07,
1220         0x49,0x83,0x44,0x84,0x0c,0x07,
1221         0x4a,0x83,0x44,0xa0,0x0c,0x07,
1222         0x4b,0x83,0x44,0x43,0x0c,0x07,
1223         0x49,0x83,0x44,0x03,0x0c,0x07,
1224         0x4a,0x83,0x44,0x18,0x0c,0x07,
1225 
1226         0x4a,0x83,0x6c,0x18,0x0c,0x07,
1227 
1228         0x48,0x01,0xc8,
1229         0x48,0x01,0x08,
1230         0x48,0x01,0x48,0x0c,
1231         0x48,0x03,0x48,0x0c,
1232         0x48,0x2b,0x48,0x0c,
1233     });
1234 
1235 
1236     test_asm(r, [&](A& a) {
1237         a.vpaddd (A::ymm0, A::ymm1, A::ymm2);  // Low registers and 0x0f map     -> 2-byte VEX.
1238         a.vpaddd (A::ymm8, A::ymm1, A::ymm2);  // A high dst register is ok      -> 2-byte VEX.
1239         a.vpaddd (A::ymm0, A::ymm8, A::ymm2);  // A high first argument register -> 2-byte VEX.
1240         a.vpaddd (A::ymm0, A::ymm1, A::ymm8);  // A high second argument         -> 3-byte VEX.
1241         a.vpmulld(A::ymm0, A::ymm1, A::ymm2);  // Using non-0x0f map instruction -> 3-byte VEX.
1242         a.vpsubd (A::ymm0, A::ymm1, A::ymm2);  // Test vpsubd to ensure argument order is right.
1243     },{
1244         /*    VEX     */ /*op*/ /*modRM*/
1245         0xc5,       0xf5, 0xfe, 0xc2,
1246         0xc5,       0x75, 0xfe, 0xc2,
1247         0xc5,       0xbd, 0xfe, 0xc2,
1248         0xc4, 0xc1, 0x75, 0xfe, 0xc0,
1249         0xc4, 0xe2, 0x75, 0x40, 0xc2,
1250         0xc5,       0xf5, 0xfa, 0xc2,
1251     });
1252 
1253     test_asm(r, [&](A& a) {
1254         a.vpaddw   (A::ymm4, A::ymm3, A::ymm2);
1255         a.vpavgw   (A::ymm4, A::ymm3, A::ymm2);
1256         a.vpcmpeqw (A::ymm4, A::ymm3, A::ymm2);
1257         a.vpcmpgtw (A::ymm4, A::ymm3, A::ymm2);
1258 
1259         a.vpminsw  (A::ymm4, A::ymm3, A::ymm2);
1260         a.vpmaxsw  (A::ymm4, A::ymm3, A::ymm2);
1261         a.vpminuw  (A::ymm4, A::ymm3, A::ymm2);
1262         a.vpmaxuw  (A::ymm4, A::ymm3, A::ymm2);
1263 
1264         a.vpmulhrsw(A::ymm4, A::ymm3, A::ymm2);
1265         a.vpabsw   (A::ymm4, A::ymm3);
1266         a.vpsllw   (A::ymm4, A::ymm3, 12);
1267         a.vpsraw   (A::ymm4, A::ymm3, 12);
1268     },{
1269         0xc5,     0xe5, 0xfd, 0xe2,
1270         0xc5,     0xe5, 0xe3, 0xe2,
1271         0xc5,     0xe5, 0x75, 0xe2,
1272         0xc5,     0xe5, 0x65, 0xe2,
1273 
1274         0xc5,     0xe5, 0xea, 0xe2,
1275         0xc5,     0xe5, 0xee, 0xe2,
1276         0xc4,0xe2,0x65, 0x3a, 0xe2,
1277         0xc4,0xe2,0x65, 0x3e, 0xe2,
1278 
1279         0xc4,0xe2,0x65, 0x0b, 0xe2,
1280         0xc4,0xe2,0x7d, 0x1d, 0xe3,
1281         0xc5,0xdd,0x71, 0xf3, 0x0c,
1282         0xc5,0xdd,0x71, 0xe3, 0x0c,
1283     });
1284 
1285     test_asm(r, [&](A& a) {
1286         A::Label l;
1287         a.vcmpeqps (A::ymm0, A::ymm1, &l);      // vcmpeqps 0x1c(%rip), %ymm1, %ymm0
1288         a.vpcmpeqd (A::ymm0, A::ymm1, A::ymm2);
1289         a.vpcmpgtd (A::ymm0, A::ymm1, A::ymm2);
1290         a.vcmpeqps (A::ymm0, A::ymm1, A::ymm2);
1291         a.vcmpltps (A::ymm0, A::ymm1, A::ymm2);
1292         a.vcmpleps (A::ymm0, A::ymm1, A::ymm2);
1293         a.vcmpneqps(A::ymm0, A::ymm1, A::ymm2);
1294         a.label(&l);   // 28 bytes after the vcmpeqps that uses it.
1295     },{
1296         0xc5,0xf4,0xc2,0x05,0x1c,0x00,0x00,0x00,0x00,
1297         0xc5,0xf5,0x76,0xc2,
1298         0xc5,0xf5,0x66,0xc2,
1299         0xc5,0xf4,0xc2,0xc2,0x00,
1300         0xc5,0xf4,0xc2,0xc2,0x01,
1301         0xc5,0xf4,0xc2,0xc2,0x02,
1302         0xc5,0xf4,0xc2,0xc2,0x04,
1303     });
1304 
1305     test_asm(r, [&](A& a) {
1306         a.vminps(A::ymm0, A::ymm1, A::ymm2);
1307         a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
1308     },{
1309         0xc5,0xf4,0x5d,0xc2,
1310         0xc5,0xf4,0x5f,0xc2,
1311     });
1312 
1313     test_asm(r, [&](A& a) {
1314         a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
1315     },{
1316         0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
1317     });
1318 
1319     test_asm(r, [&](A& a) {
1320         a.vpsrld(A::ymm15, A::ymm2, 8);
1321         a.vpsrld(A::ymm0 , A::ymm8, 5);
1322     },{
1323         0xc5,     0x85, 0x72,0xd2, 0x08,
1324         0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
1325     });
1326 
1327     test_asm(r, [&](A& a) {
1328         A::Label l;
1329         a.vpermps(A::ymm1, A::ymm2, A::Mem{A::rdi, 32});
1330         a.vperm2f128(A::ymm1, A::ymm2, &l, 0x20);
1331         a.vpermq(A::ymm1, A::ymm2, 5);
1332         a.label(&l);  // 6 bytes after vperm2f128
1333     },{
1334         0xc4,0xe2,0x6d,0x16,0x4f,0x20,
1335         0xc4,0xe3,0x6d,0x06,0x0d,0x06,0x00,0x00,0x00,0x20,
1336         0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
1337     });
1338 
1339     test_asm(r, [&](A& a) {
1340         a.vpunpckldq(A::ymm1, A::ymm2, A::Mem{A::rdi});
1341         a.vpunpckhdq(A::ymm1, A::ymm2, A::ymm3);
1342     },{
1343         0xc5,0xed,0x62,0x0f,
1344         0xc5,0xed,0x6a,0xcb,
1345     });
1346 
1347     test_asm(r, [&](A& a) {
1348         a.vroundps(A::ymm1, A::ymm2, A::NEAREST);
1349         a.vroundps(A::ymm1, A::ymm2, A::FLOOR);
1350         a.vroundps(A::ymm1, A::ymm2, A::CEIL);
1351         a.vroundps(A::ymm1, A::ymm2, A::TRUNC);
1352     },{
1353         0xc4,0xe3,0x7d,0x08,0xca,0x00,
1354         0xc4,0xe3,0x7d,0x08,0xca,0x01,
1355         0xc4,0xe3,0x7d,0x08,0xca,0x02,
1356         0xc4,0xe3,0x7d,0x08,0xca,0x03,
1357     });
1358 
1359     test_asm(r, [&](A& a) {
1360         A::Label l;
1361         a.label(&l);
1362         a.byte(1);
1363         a.byte(2);
1364         a.byte(3);
1365         a.byte(4);
1366 
1367         a.vbroadcastss(A::ymm0 , &l);
1368         a.vbroadcastss(A::ymm1 , &l);
1369         a.vbroadcastss(A::ymm8 , &l);
1370         a.vbroadcastss(A::ymm15, &l);
1371 
1372         a.vpshufb(A::ymm4, A::ymm3, &l);
1373         a.vpaddd (A::ymm4, A::ymm3, &l);
1374         a.vpsubd (A::ymm4, A::ymm3, &l);
1375 
1376         a.vptest(A::ymm4, &l);
1377 
1378         a.vmulps (A::ymm4, A::ymm3, &l);
1379     },{
1380         0x01, 0x02, 0x03, 0x4,
1381 
1382         /*     VEX    */  /*op*/ /*   ModRM    */  /*     offset     */
1383         0xc4, 0xe2, 0x7d,  0x18,   0b00'000'101,   0xf3,0xff,0xff,0xff,   // 0xfffffff3 == -13
1384         0xc4, 0xe2, 0x7d,  0x18,   0b00'001'101,   0xea,0xff,0xff,0xff,   // 0xffffffea == -22
1385         0xc4, 0x62, 0x7d,  0x18,   0b00'000'101,   0xe1,0xff,0xff,0xff,   // 0xffffffe1 == -31
1386         0xc4, 0x62, 0x7d,  0x18,   0b00'111'101,   0xd8,0xff,0xff,0xff,   // 0xffffffd8 == -40
1387 
1388         0xc4, 0xe2, 0x65,  0x00,   0b00'100'101,   0xcf,0xff,0xff,0xff,   // 0xffffffcf == -49
1389 
1390         0xc5, 0xe5,        0xfe,   0b00'100'101,   0xc7,0xff,0xff,0xff,   // 0xffffffc7 == -57
1391         0xc5, 0xe5,        0xfa,   0b00'100'101,   0xbf,0xff,0xff,0xff,   // 0xffffffbf == -65
1392 
1393         0xc4, 0xe2, 0x7d,  0x17,   0b00'100'101,   0xb6,0xff,0xff,0xff,   // 0xffffffb6 == -74
1394 
1395         0xc5, 0xe4,        0x59,   0b00'100'101,   0xae,0xff,0xff,0xff,   // 0xffffffaf == -82
1396     });
1397 
1398     test_asm(r, [&](A& a) {
1399         a.vbroadcastss(A::ymm0,  A::Mem{A::rdi,   0});
1400         a.vbroadcastss(A::ymm13, A::Mem{A::r14,   7});
1401         a.vbroadcastss(A::ymm8,  A::Mem{A::rdx, -12});
1402         a.vbroadcastss(A::ymm8,  A::Mem{A::rdx, 400});
1403 
1404         a.vbroadcastss(A::ymm8,  A::xmm0);
1405         a.vbroadcastss(A::ymm0,  A::xmm13);
1406     },{
1407         /*   VEX    */ /*op*/     /*ModRM*/   /*offset*/
1408         0xc4,0xe2,0x7d, 0x18,   0b00'000'111,
1409         0xc4,0x42,0x7d, 0x18,   0b01'101'110,  0x07,
1410         0xc4,0x62,0x7d, 0x18,   0b01'000'010,  0xf4,
1411         0xc4,0x62,0x7d, 0x18,   0b10'000'010,  0x90,0x01,0x00,0x00,
1412 
1413         0xc4,0x62,0x7d, 0x18,   0b11'000'000,
1414         0xc4,0xc2,0x7d, 0x18,   0b11'000'101,
1415     });
1416 
1417     test_asm(r, [&](A& a) {
1418         A::Label l;
1419         a.label(&l);
1420         a.jne(&l);
1421         a.jne(&l);
1422         a.je (&l);
1423         a.jmp(&l);
1424         a.jl (&l);
1425         a.jc (&l);
1426 
1427         a.cmp(A::rdx, 1);
1428         a.cmp(A::rax, 12);
1429         a.cmp(A::r14, 2000000000);
1430     },{
1431         0x0f,0x85, 0xfa,0xff,0xff,0xff,   // near jne -6 bytes
1432         0x0f,0x85, 0xf4,0xff,0xff,0xff,   // near jne -12 bytes
1433         0x0f,0x84, 0xee,0xff,0xff,0xff,   // near je  -18 bytes
1434         0xe9,      0xe9,0xff,0xff,0xff,   // near jmp -23 bytes
1435         0x0f,0x8c, 0xe3,0xff,0xff,0xff,   // near jl  -29 bytes
1436         0x0f,0x82, 0xdd,0xff,0xff,0xff,   // near jc  -35 bytes
1437 
1438         0x48,0x83,0xfa,0x01,
1439         0x48,0x83,0xf8,0x0c,
1440         0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
1441     });
1442 
1443     test_asm(r, [&](A& a) {
1444         a.vmovups(A::ymm5, A::Mem{A::rsi});
1445         a.vmovups(A::Mem{A::rsi}, A::ymm5);
1446 
1447         a.vmovups(A::xmm5, A::Mem{A::rsi});
1448         a.vmovups(A::Mem{A::rsi}, A::xmm5);
1449 
1450         a.vpmovzxwd(A::ymm4, A::Mem{A::rsi});
1451         a.vpmovzxbd(A::ymm4, A::Mem{A::rsi});
1452 
1453         a.vmovq(A::Mem{A::rdx}, A::xmm15);
1454     },{
1455         /*    VEX    */  /*Op*/  /*  ModRM  */
1456         0xc5,     0xfc,   0x10,  0b00'101'110,
1457         0xc5,     0xfc,   0x11,  0b00'101'110,
1458 
1459         0xc5,     0xf8,   0x10,  0b00'101'110,
1460         0xc5,     0xf8,   0x11,  0b00'101'110,
1461 
1462         0xc4,0xe2,0x7d,   0x33,  0b00'100'110,
1463         0xc4,0xe2,0x7d,   0x31,  0b00'100'110,
1464 
1465         0xc5,     0x79,   0xd6,  0b00'111'010,
1466     });
1467 
1468     test_asm(r, [&](A& a) {
1469         a.vmovups(A::ymm5, A::Mem{A::rsp,  0});
1470         a.vmovups(A::ymm5, A::Mem{A::rsp, 64});
1471         a.vmovups(A::ymm5, A::Mem{A::rsp,128});
1472 
1473         a.vmovups(A::Mem{A::rsp,  0}, A::ymm5);
1474         a.vmovups(A::Mem{A::rsp, 64}, A::ymm5);
1475         a.vmovups(A::Mem{A::rsp,128}, A::ymm5);
1476     },{
1477         0xc5,0xfc,0x10,0x2c,0x24,
1478         0xc5,0xfc,0x10,0x6c,0x24,0x40,
1479         0xc5,0xfc,0x10,0xac,0x24,0x80,0x00,0x00,0x00,
1480 
1481         0xc5,0xfc,0x11,0x2c,0x24,
1482         0xc5,0xfc,0x11,0x6c,0x24,0x40,
1483         0xc5,0xfc,0x11,0xac,0x24,0x80,0x00,0x00,0x00,
1484     });
1485 
1486     test_asm(r, [&](A& a) {
1487         a.movzbq(A::rax, A::Mem{A::rsi});   // Low registers for src and dst.
1488         a.movzbq(A::rax, A::Mem{A::r8,});   // High src register.
1489         a.movzbq(A::r8 , A::Mem{A::rsi});   // High dst register.
1490         a.movzbq(A::r8,  A::Mem{A::rsi, 12});
1491         a.movzbq(A::r8,  A::Mem{A::rsi, 400});
1492 
1493         a.movzwq(A::rax, A::Mem{A::rsi});   // Low registers for src and dst.
1494         a.movzwq(A::rax, A::Mem{A::r8,});   // High src register.
1495         a.movzwq(A::r8 , A::Mem{A::rsi});   // High dst register.
1496         a.movzwq(A::r8,  A::Mem{A::rsi, 12});
1497         a.movzwq(A::r8,  A::Mem{A::rsi, 400});
1498 
1499         a.vmovd(A::Mem{A::rax}, A::xmm0);
1500         a.vmovd(A::Mem{A::rax}, A::xmm8);
1501         a.vmovd(A::Mem{A::r8 }, A::xmm0);
1502 
1503         a.vmovd(A::xmm0, A::Mem{A::rax});
1504         a.vmovd(A::xmm8, A::Mem{A::rax});
1505         a.vmovd(A::xmm0, A::Mem{A::r8 });
1506 
1507         a.vmovd(A::xmm0 , A::Mem{A::rax, 0, A::rcx, A::FOUR});
1508         a.vmovd(A::xmm15, A::Mem{A::rax, 0, A::r8,  A::TWO });
1509         a.vmovd(A::xmm0 , A::Mem{A::r8 , 0, A::rcx});
1510 
1511         a.vmovd(A::rax, A::xmm0);
1512         a.vmovd(A::rax, A::xmm8);
1513         a.vmovd(A::r8 ,  A::xmm0);
1514 
1515         a.vmovd(A::xmm0, A::rax);
1516         a.vmovd(A::xmm8, A::rax);
1517         a.vmovd(A::xmm0, A::r8 );
1518 
1519         a.movb(A::Mem{A::rdx}, A::rax);
1520         a.movb(A::Mem{A::rdx}, A::r8 );
1521         a.movb(A::Mem{A::r8 }, A::rax);
1522 
1523         a.movb(A::rdx, A::Mem{A::rax});
1524         a.movb(A::rdx, A::Mem{A::r8 });
1525         a.movb(A::r8 , A::Mem{A::rax});
1526 
1527         a.movb(A::rdx, 12);
1528         a.movb(A::rax,  4);
1529         a.movb(A::r8 , -1);
1530 
1531         a.movb(A::Mem{A::rdx}, 12);
1532         a.movb(A::Mem{A::rax},  4);
1533         a.movb(A::Mem{A::r8 }, -1);
1534     },{
1535         0x48,0x0f,0xb6,0x06,     // movzbq (%rsi), %rax
1536         0x49,0x0f,0xb6,0x00,
1537         0x4c,0x0f,0xb6,0x06,
1538         0x4c,0x0f,0xb6,0x46, 12,
1539         0x4c,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,
1540 
1541         0x48,0x0f,0xb7,0x06,    // movzwq (%rsi), %rax
1542         0x49,0x0f,0xb7,0x00,
1543         0x4c,0x0f,0xb7,0x06,
1544         0x4c,0x0f,0xb7,0x46, 12,
1545         0x4c,0x0f,0xb7,0x86, 0x90,0x01,0x00,0x00,
1546 
1547         0xc5,0xf9,0x7e,0x00,
1548         0xc5,0x79,0x7e,0x00,
1549         0xc4,0xc1,0x79,0x7e,0x00,
1550 
1551         0xc5,0xf9,0x6e,0x00,
1552         0xc5,0x79,0x6e,0x00,
1553         0xc4,0xc1,0x79,0x6e,0x00,
1554 
1555         0xc5,0xf9,0x6e,0x04,0x88,
1556         0xc4,0x21,0x79,0x6e,0x3c,0x40,
1557         0xc4,0xc1,0x79,0x6e,0x04,0x08,
1558 
1559         0xc5,0xf9,0x7e,0xc0,
1560         0xc5,0x79,0x7e,0xc0,
1561         0xc4,0xc1,0x79,0x7e,0xc0,
1562 
1563         0xc5,0xf9,0x6e,0xc0,
1564         0xc5,0x79,0x6e,0xc0,
1565         0xc4,0xc1,0x79,0x6e,0xc0,
1566 
1567         0x48 ,0x88, 0x02,
1568         0x4c, 0x88, 0x02,
1569         0x49, 0x88, 0x00,
1570 
1571         0x48 ,0x8a, 0x10,
1572         0x49, 0x8a, 0x10,
1573         0x4c, 0x8a, 0x00,
1574 
1575         0x48, 0xc6, 0xc2, 0x0c,
1576         0x48, 0xc6, 0xc0, 0x04,
1577         0x49, 0xc6, 0xc0, 0xff,
1578 
1579         0x48, 0xc6, 0x02, 0x0c,
1580         0x48, 0xc6, 0x00, 0x04,
1581         0x49, 0xc6, 0x00, 0xff,
1582     });
1583 
1584     test_asm(r, [&](A& a) {
1585         a.vpinsrd(A::xmm1, A::xmm8, A::Mem{A::rsi}, 1);   // vpinsrd $1, (%rsi), %xmm8, %xmm1
1586         a.vpinsrd(A::xmm8, A::xmm1, A::Mem{A::r8 }, 3);   // vpinsrd $3, (%r8), %xmm1, %xmm8;
1587 
1588         a.vpinsrw(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4);   // vpinsrw $4, (%rsi), %xmm8, %xmm1
1589         a.vpinsrw(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinrsw $12, (%r8), %xmm1, %xmm8
1590 
1591         a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4);   // vpinsrb $4, (%rsi), %xmm8, %xmm1
1592         a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinsrb $12, (%r8), %xmm1, %xmm8
1593 
1594         a.vextracti128(A::xmm1, A::ymm8, 1);  // vextracti128 $1, %ymm8, %xmm1
1595         a.vextracti128(A::xmm8, A::ymm1, 0);  // vextracti128 $0, %ymm1, %xmm8
1596 
1597         a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3);  // vpextrd  $3, %xmm8, (%rsi)
1598         a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2);  // vpextrd  $2, %xmm1, (%r8)
1599 
1600         a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
1601         a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
1602 
1603         a.vpextrb(A::Mem{A::rsi}, A::xmm8, 7);
1604         a.vpextrb(A::Mem{A::r8 }, A::xmm1, 15);
1605     },{
1606         0xc4,0xe3,0x39, 0x22, 0x0e, 1,
1607         0xc4,0x43,0x71, 0x22, 0x00, 3,
1608 
1609         0xc5,0xb9,      0xc4, 0x0e,  4,
1610         0xc4,0x41,0x71, 0xc4, 0x00, 12,
1611 
1612         0xc4,0xe3,0x39, 0x20, 0x0e,  4,
1613         0xc4,0x43,0x71, 0x20, 0x00, 12,
1614 
1615         0xc4,0x63,0x7d,0x39,0xc1, 1,
1616         0xc4,0xc3,0x7d,0x39,0xc8, 0,
1617 
1618         0xc4,0x63,0x79,0x16,0x06, 3,
1619         0xc4,0xc3,0x79,0x16,0x08, 2,
1620 
1621         0xc4,0x63,0x79, 0x15, 0x06,  7,
1622         0xc4,0xc3,0x79, 0x15, 0x08, 15,
1623 
1624         0xc4,0x63,0x79, 0x14, 0x06,  7,
1625         0xc4,0xc3,0x79, 0x14, 0x08, 15,
1626     });
1627 
1628     test_asm(r, [&](A& a) {
1629         a.vpandn(A::ymm3, A::ymm12, A::ymm2);
1630     },{
1631         0xc5, 0x9d, 0xdf, 0xda,
1632     });
1633 
1634     test_asm(r, [&](A& a) {
1635         A::Label l;
1636         a.vmovdqa(A::ymm3, A::ymm2);                                // vmovdqa %ymm2         , %ymm3
1637 
1638         a.vmovdqa(A::ymm3, A::Mem{A::rsi});                         // vmovdqa  (%rsi)       , %ymm3
1639         a.vmovdqa(A::ymm3, A::Mem{A::rsp});                         // vmovdqa  (%rsp)       , %ymm3
1640         a.vmovdqa(A::ymm3, A::Mem{A::r11});                         // vmovdqa  (%r11)       , %ymm3
1641 
1642         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4});                     // vmovdqa 4(%rsi)       , %ymm3
1643         a.vmovdqa(A::ymm3, A::Mem{A::rsp,  4});                     // vmovdqa 4(%rsp)       , %ymm3
1644 
1645         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::rax, A::EIGHT});   // vmovdqa 4(%rsi,%rax,8), %ymm3
1646         a.vmovdqa(A::ymm3, A::Mem{A::r11,  4, A::rax, A::TWO  });   // vmovdqa 4(%r11,%rax,2), %ymm3
1647         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11, A::FOUR });   // vmovdqa 4(%rsi,%r11,4), %ymm3
1648         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11, A::ONE  });   // vmovdqa 4(%rsi,%r11,1), %ymm3
1649         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11});             // vmovdqa 4(%rsi,%r11)  , %ymm3
1650 
1651         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  64, A::r11});            // vmovdqa  64(%rsi,%r11), %ymm3
1652         a.vmovdqa(A::ymm3, A::Mem{A::rsi, 128, A::r11});            // vmovdqa 128(%rsi,%r11), %ymm3
1653         a.vmovdqa(A::ymm3, &l);                                     // vmovdqa  16(%rip)     , %ymm3
1654 
1655         a.vcvttps2dq(A::ymm3, A::ymm2);
1656         a.vcvtdq2ps (A::ymm3, A::ymm2);
1657         a.vcvtps2dq (A::ymm3, A::ymm2);
1658         a.vsqrtps   (A::ymm3, A::ymm2);
1659         a.label(&l);
1660     },{
1661         0xc5,0xfd,0x6f,0xda,
1662 
1663         0xc5,0xfd,0x6f,0x1e,
1664         0xc5,0xfd,0x6f,0x1c,0x24,
1665         0xc4,0xc1,0x7d,0x6f,0x1b,
1666 
1667         0xc5,0xfd,0x6f,0x5e,0x04,
1668         0xc5,0xfd,0x6f,0x5c,0x24,0x04,
1669 
1670         0xc5,0xfd,0x6f,0x5c,0xc6,0x04,
1671         0xc4,0xc1,0x7d,0x6f,0x5c,0x43,0x04,
1672         0xc4,0xa1,0x7d,0x6f,0x5c,0x9e,0x04,
1673         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1674         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1675 
1676         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x40,
1677         0xc4,0xa1,0x7d,0x6f,0x9c,0x1e,0x80,0x00,0x00,0x00,
1678 
1679         0xc5,0xfd,0x6f,0x1d,0x10,0x00,0x00,0x00,
1680 
1681         0xc5,0xfe,0x5b,0xda,
1682         0xc5,0xfc,0x5b,0xda,
1683         0xc5,0xfd,0x5b,0xda,
1684         0xc5,0xfc,0x51,0xda,
1685     });
1686 
1687     test_asm(r, [&](A& a) {
1688         a.vcvtps2ph(A::xmm3, A::ymm2, A::CURRENT);
1689         a.vcvtps2ph(A::Mem{A::rsi, 32, A::rax, A::EIGHT}, A::ymm5, A::CEIL);
1690 
1691         a.vcvtph2ps(A::ymm15, A::Mem{A::rdi, 12, A::r9, A::ONE});
1692         a.vcvtph2ps(A::ymm2, A::xmm3);
1693     },{
1694         0xc4,0xe3,0x7d,0x1d,0xd3,0x04,
1695         0xc4,0xe3,0x7d,0x1d,0x6c,0xc6,0x20,0x02,
1696 
1697         0xc4,0x22,0x7d,0x13,0x7c,0x0f,0x0c,
1698         0xc4,0xe2,0x7d,0x13,0xd3,
1699     });
1700 
1701     test_asm(r, [&](A& a) {
1702         a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
1703         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1704         a.vgatherdps(A::ymm10, A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1705         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm12, A::rax, A::ymm1 );
1706         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::r9 , A::ymm1 );
1707         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm12);
1708         a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
1709     },{
1710         0xc4,0xe2,0x6d,0x92,0x0c,0x87,
1711         0xc4,0xe2,0x75,0x92,0x04,0x10,
1712         0xc4,0x62,0x75,0x92,0x14,0x10,
1713         0xc4,0xa2,0x75,0x92,0x04,0x20,
1714         0xc4,0xc2,0x75,0x92,0x04,0x11,
1715         0xc4,0xe2,0x1d,0x92,0x04,0x10,
1716         0xc4,0xe2,0x1d,0x92,0x04,0xd0,
1717     });
1718 
1719     test_asm(r, [&](A& a) {
1720         a.mov(A::rax, A::Mem{A::rdi,   0});
1721         a.mov(A::rax, A::Mem{A::rdi,   1});
1722         a.mov(A::rax, A::Mem{A::rdi, 512});
1723         a.mov(A::r15, A::Mem{A::r13,  42});
1724         a.mov(A::rax, A::Mem{A::r13,  42});
1725         a.mov(A::r15, A::Mem{A::rax,  42});
1726         a.mov(A::rax, 1);
1727         a.mov(A::rax, A::rcx);
1728     },{
1729         0x48, 0x8b, 0x07,
1730         0x48, 0x8b, 0x47, 0x01,
1731         0x48, 0x8b, 0x87, 0x00,0x02,0x00,0x00,
1732         0x4d, 0x8b, 0x7d, 0x2a,
1733         0x49, 0x8b, 0x45, 0x2a,
1734         0x4c, 0x8b, 0x78, 0x2a,
1735         0x48, 0xc7, 0xc0, 0x01,0x00,0x00,0x00,
1736         0x48, 0x89, 0xc8,
1737     });
1738 
1739     // echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
1740 
1741     test_asm(r, [&](A& a) {
1742         a.and16b(A::v4, A::v3, A::v1);
1743         a.orr16b(A::v4, A::v3, A::v1);
1744         a.eor16b(A::v4, A::v3, A::v1);
1745         a.bic16b(A::v4, A::v3, A::v1);
1746         a.bsl16b(A::v4, A::v3, A::v1);
1747         a.not16b(A::v4, A::v3);
1748 
1749         a.add4s(A::v4, A::v3, A::v1);
1750         a.sub4s(A::v4, A::v3, A::v1);
1751         a.mul4s(A::v4, A::v3, A::v1);
1752 
1753         a.cmeq4s(A::v4, A::v3, A::v1);
1754         a.cmgt4s(A::v4, A::v3, A::v1);
1755 
1756         a.sub8h(A::v4, A::v3, A::v1);
1757         a.mul8h(A::v4, A::v3, A::v1);
1758 
1759         a.fadd4s(A::v4, A::v3, A::v1);
1760         a.fsub4s(A::v4, A::v3, A::v1);
1761         a.fmul4s(A::v4, A::v3, A::v1);
1762         a.fdiv4s(A::v4, A::v3, A::v1);
1763         a.fmin4s(A::v4, A::v3, A::v1);
1764         a.fmax4s(A::v4, A::v3, A::v1);
1765 
1766         a.fneg4s (A::v4, A::v3);
1767         a.fsqrt4s(A::v4, A::v3);
1768 
1769         a.fmla4s(A::v4, A::v3, A::v1);
1770         a.fmls4s(A::v4, A::v3, A::v1);
1771 
1772         a.fcmeq4s(A::v4, A::v3, A::v1);
1773         a.fcmgt4s(A::v4, A::v3, A::v1);
1774         a.fcmge4s(A::v4, A::v3, A::v1);
1775     },{
1776         0x64,0x1c,0x21,0x4e,
1777         0x64,0x1c,0xa1,0x4e,
1778         0x64,0x1c,0x21,0x6e,
1779         0x64,0x1c,0x61,0x4e,
1780         0x64,0x1c,0x61,0x6e,
1781         0x64,0x58,0x20,0x6e,
1782 
1783         0x64,0x84,0xa1,0x4e,
1784         0x64,0x84,0xa1,0x6e,
1785         0x64,0x9c,0xa1,0x4e,
1786 
1787         0x64,0x8c,0xa1,0x6e,
1788         0x64,0x34,0xa1,0x4e,
1789 
1790         0x64,0x84,0x61,0x6e,
1791         0x64,0x9c,0x61,0x4e,
1792 
1793         0x64,0xd4,0x21,0x4e,
1794         0x64,0xd4,0xa1,0x4e,
1795         0x64,0xdc,0x21,0x6e,
1796         0x64,0xfc,0x21,0x6e,
1797         0x64,0xf4,0xa1,0x4e,
1798         0x64,0xf4,0x21,0x4e,
1799 
1800         0x64,0xf8,0xa0,0x6e,
1801         0x64,0xf8,0xa1,0x6e,
1802 
1803         0x64,0xcc,0x21,0x4e,
1804         0x64,0xcc,0xa1,0x4e,
1805 
1806         0x64,0xe4,0x21,0x4e,
1807         0x64,0xe4,0xa1,0x6e,
1808         0x64,0xe4,0x21,0x6e,
1809     });
1810 
1811     test_asm(r, [&](A& a) {
1812         a.shl4s(A::v4, A::v3,  0);
1813         a.shl4s(A::v4, A::v3,  1);
1814         a.shl4s(A::v4, A::v3,  8);
1815         a.shl4s(A::v4, A::v3, 16);
1816         a.shl4s(A::v4, A::v3, 31);
1817 
1818         a.sshr4s(A::v4, A::v3,  1);
1819         a.sshr4s(A::v4, A::v3,  8);
1820         a.sshr4s(A::v4, A::v3, 31);
1821 
1822         a.ushr4s(A::v4, A::v3,  1);
1823         a.ushr4s(A::v4, A::v3,  8);
1824         a.ushr4s(A::v4, A::v3, 31);
1825 
1826         a.ushr8h(A::v4, A::v3,  1);
1827         a.ushr8h(A::v4, A::v3,  8);
1828         a.ushr8h(A::v4, A::v3, 15);
1829     },{
1830         0x64,0x54,0x20,0x4f,
1831         0x64,0x54,0x21,0x4f,
1832         0x64,0x54,0x28,0x4f,
1833         0x64,0x54,0x30,0x4f,
1834         0x64,0x54,0x3f,0x4f,
1835 
1836         0x64,0x04,0x3f,0x4f,
1837         0x64,0x04,0x38,0x4f,
1838         0x64,0x04,0x21,0x4f,
1839 
1840         0x64,0x04,0x3f,0x6f,
1841         0x64,0x04,0x38,0x6f,
1842         0x64,0x04,0x21,0x6f,
1843 
1844         0x64,0x04,0x1f,0x6f,
1845         0x64,0x04,0x18,0x6f,
1846         0x64,0x04,0x11,0x6f,
1847     });
1848 
1849     test_asm(r, [&](A& a) {
1850         a.sli4s(A::v4, A::v3,  0);
1851         a.sli4s(A::v4, A::v3,  1);
1852         a.sli4s(A::v4, A::v3,  8);
1853         a.sli4s(A::v4, A::v3, 16);
1854         a.sli4s(A::v4, A::v3, 31);
1855     },{
1856         0x64,0x54,0x20,0x6f,
1857         0x64,0x54,0x21,0x6f,
1858         0x64,0x54,0x28,0x6f,
1859         0x64,0x54,0x30,0x6f,
1860         0x64,0x54,0x3f,0x6f,
1861     });
1862 
1863     test_asm(r, [&](A& a) {
1864         a.scvtf4s (A::v4, A::v3);
1865         a.fcvtzs4s(A::v4, A::v3);
1866         a.fcvtns4s(A::v4, A::v3);
1867         a.frintp4s(A::v4, A::v3);
1868         a.frintm4s(A::v4, A::v3);
1869         a.fcvtn   (A::v4, A::v3);
1870         a.fcvtl   (A::v4, A::v3);
1871     },{
1872         0x64,0xd8,0x21,0x4e,
1873         0x64,0xb8,0xa1,0x4e,
1874         0x64,0xa8,0x21,0x4e,
1875         0x64,0x88,0xa1,0x4e,
1876         0x64,0x98,0x21,0x4e,
1877         0x64,0x68,0x21,0x0e,
1878         0x64,0x78,0x21,0x0e,
1879     });
1880 
1881     test_asm(r, [&](A& a) {
1882         a.sub (A::sp, A::sp, 32);  // sub   sp, sp, #32
1883         a.strq(A::v0, A::sp, 1);   // str   q0, [sp, #16]
1884         a.strq(A::v1, A::sp);      // str   q1, [sp]
1885         a.strd(A::v0, A::sp, 6);   // str   d0, [sp, #48]
1886         a.strs(A::v0, A::sp, 6);   // str   s0, [sp, #24]
1887         a.strh(A::v0, A::sp, 10);  // str   h0, [sp, #20]
1888         a.strb(A::v0, A::sp, 47);  // str   b0, [sp, #47]
1889         a.ldrb(A::v9, A::sp, 42);  // ldr   b9, [sp, #42]
1890         a.ldrh(A::v9, A::sp, 47);  // ldr   h9, [sp, #94]
1891         a.ldrs(A::v7, A::sp, 10);  // ldr   s7, [sp, #40]
1892         a.ldrd(A::v7, A::sp,  1);  // ldr   d7, [sp, #8]
1893         a.ldrq(A::v5, A::sp, 128); // ldr   q5, [sp, #2048]
1894         a.add (A::sp, A::sp, 32);  // add   sp, sp, #32
1895     },{
1896          0xff,0x83,0x00,0xd1,
1897          0xe0,0x07,0x80,0x3d,
1898          0xe1,0x03,0x80,0x3d,
1899          0xe0,0x1b,0x00,0xfd,
1900          0xe0,0x1b,0x00,0xbd,
1901          0xe0,0x2b,0x00,0x7d,
1902          0xe0,0xbf,0x00,0x3d,
1903          0xe9,0xab,0x40,0x3d,
1904          0xe9,0xbf,0x40,0x7d,
1905          0xe7,0x2b,0x40,0xbd,
1906          0xe7,0x07,0x40,0xfd,
1907          0xe5,0x03,0xc2,0x3d,
1908          0xff,0x83,0x00,0x91,
1909     });
1910 
1911     test_asm(r, [&](A& a) {
1912         a.brk(0);
1913         a.brk(65535);
1914 
1915         a.ret(A::x30);   // Conventional ret using link register.
1916         a.ret(A::x13);   // Can really return using any register if we like.
1917 
1918         a.add(A::x2, A::x2,  4);
1919         a.add(A::x3, A::x2, 32);
1920 
1921         a.sub(A::x2, A::x2, 4);
1922         a.sub(A::x3, A::x2, 32);
1923 
1924         a.subs(A::x2, A::x2,  4);
1925         a.subs(A::x3, A::x2, 32);
1926 
1927         a.subs(A::xzr, A::x2, 4);  // These are actually the same instruction!
1928         a.cmp(A::x2, 4);
1929 
1930         A::Label l;
1931         a.label(&l);
1932         a.bne(&l);
1933         a.bne(&l);
1934         a.blt(&l);
1935         a.b(&l);
1936         a.cbnz(A::x2, &l);
1937         a.cbz(A::x2, &l);
1938 
1939         a.add(A::x3, A::x2, A::x1);             // add x3,x2,x1
1940         a.add(A::x3, A::x2, A::x1, A::ASR, 3);  // add x3,x2,x1, asr #3
1941     },{
1942         0x00,0x00,0x20,0xd4,
1943         0xe0,0xff,0x3f,0xd4,
1944 
1945         0xc0,0x03,0x5f,0xd6,
1946         0xa0,0x01,0x5f,0xd6,
1947 
1948         0x42,0x10,0x00,0x91,
1949         0x43,0x80,0x00,0x91,
1950 
1951         0x42,0x10,0x00,0xd1,
1952         0x43,0x80,0x00,0xd1,
1953 
1954         0x42,0x10,0x00,0xf1,
1955         0x43,0x80,0x00,0xf1,
1956 
1957         0x5f,0x10,0x00,0xf1,
1958         0x5f,0x10,0x00,0xf1,
1959 
1960         0x01,0x00,0x00,0x54,   // b.ne #0
1961         0xe1,0xff,0xff,0x54,   // b.ne #-4
1962         0xcb,0xff,0xff,0x54,   // b.lt #-8
1963         0xae,0xff,0xff,0x54,   // b.al #-12
1964         0x82,0xff,0xff,0xb5,   // cbnz x2, #-16
1965         0x62,0xff,0xff,0xb4,   // cbz x2, #-20
1966 
1967         0x43,0x00,0x01,0x8b,
1968         0x43,0x0c,0x81,0x8b,
1969     });
1970 
1971     // Can we cbz() to a not-yet-defined label?
1972     test_asm(r, [&](A& a) {
1973         A::Label l;
1974         a.cbz(A::x2, &l);
1975         a.add(A::x3, A::x2, 32);
1976         a.label(&l);
1977         a.ret(A::x30);
1978     },{
1979         0x42,0x00,0x00,0xb4,  // cbz x2, #8
1980         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1981         0xc0,0x03,0x5f,0xd6,  // ret
1982     });
1983 
1984     // If we start a label as a backward label,
1985     // can we redefine it to be a future label?
1986     // (Not sure this is useful... just want to test it works.)
1987     test_asm(r, [&](A& a) {
1988         A::Label l1;
1989         a.label(&l1);
1990         a.add(A::x3, A::x2, 32);
1991         a.cbz(A::x2, &l1);          // This will jump backward... nothing sneaky.
1992 
1993         A::Label l2;                // Start off the same...
1994         a.label(&l2);
1995         a.add(A::x3, A::x2, 32);
1996         a.cbz(A::x2, &l2);          // Looks like this will go backward...
1997         a.add(A::x2, A::x2, 4);
1998         a.add(A::x3, A::x2, 32);
1999         a.label(&l2);               // But no... actually forward!  What a switcheroo!
2000     },{
2001         0x43,0x80,0x00,0x91,  // add x3, x2, #32
2002         0xe2,0xff,0xff,0xb4,  // cbz x2, #-4
2003 
2004         0x43,0x80,0x00,0x91,  // add x3, x2, #32
2005         0x62,0x00,0x00,0xb4,  // cbz x2, #12
2006         0x42,0x10,0x00,0x91,  // add x2, x2, #4
2007         0x43,0x80,0x00,0x91,  // add x3, x2, #32
2008     });
2009 
2010     // Loading from a label on ARM.
2011     test_asm(r, [&](A& a) {
2012         A::Label fore,aft;
2013         a.label(&fore);
2014         a.word(0x01234567);
2015         a.ldrq(A::v1, &fore);
2016         a.ldrq(A::v2, &aft);
2017         a.label(&aft);
2018         a.word(0x76543210);
2019     },{
2020         0x67,0x45,0x23,0x01,
2021         0xe1,0xff,0xff,0x9c,  // ldr q1, #-4
2022         0x22,0x00,0x00,0x9c,  // ldr q2, #4
2023         0x10,0x32,0x54,0x76,
2024     });
2025 
2026     test_asm(r, [&](A& a) {
2027         a.ldrq(A::v0, A::x8);
2028         a.strq(A::v0, A::x8);
2029     },{
2030         0x00,0x01,0xc0,0x3d,
2031         0x00,0x01,0x80,0x3d,
2032     });
2033 
2034     test_asm(r, [&](A& a) {
2035         a.dup4s  (A::v0, A::x8);
2036         a.ld1r4s (A::v0, A::x8);  // echo 'ld1r.4s {v0}, [x8]' | llvm-mc --show-encoding
2037         a.ld1r8h (A::v0, A::x8);
2038         a.ld1r16b(A::v0, A::x8);
2039     },{
2040         0x00,0x0d,0x04,0x4e,
2041         0x00,0xc9,0x40,0x4d,
2042         0x00,0xc5,0x40,0x4d,
2043         0x00,0xc1,0x40,0x4d,
2044     });
2045 
2046     test_asm(r, [&](A& a) {
2047         a.ld24s(A::v0, A::x8);  // echo 'ld2.4s {v0,v1}, [x8]' | llvm-mc --show-encoding
2048         a.ld44s(A::v0, A::x8);
2049         a.st24s(A::v0, A::x8);
2050         a.st44s(A::v0, A::x8);  // echo 'st4.4s {v0,v1,v2,v3}, [x8]' | llvm-mc --show-encoding
2051 
2052         a.ld24s(A::v0, A::x8, 0);  //echo 'ld2 {v0.s,v1.s}[0], [x8]' | llvm-mc --show-encoding
2053         a.ld24s(A::v0, A::x8, 1);
2054         a.ld24s(A::v0, A::x8, 2);
2055         a.ld24s(A::v0, A::x8, 3);
2056 
2057         a.ld44s(A::v0, A::x8, 0);  // ld4 {v0.s,v1.s,v2.s,v3.s}[0], [x8]
2058         a.ld44s(A::v0, A::x8, 1);
2059         a.ld44s(A::v0, A::x8, 2);
2060         a.ld44s(A::v0, A::x8, 3);
2061     },{
2062         0x00,0x89,0x40,0x4c,
2063         0x00,0x09,0x40,0x4c,
2064         0x00,0x89,0x00,0x4c,
2065         0x00,0x09,0x00,0x4c,
2066 
2067         0x00,0x81,0x60,0x0d,
2068         0x00,0x91,0x60,0x0d,
2069         0x00,0x81,0x60,0x4d,
2070         0x00,0x91,0x60,0x4d,
2071 
2072         0x00,0xa1,0x60,0x0d,
2073         0x00,0xb1,0x60,0x0d,
2074         0x00,0xa1,0x60,0x4d,
2075         0x00,0xb1,0x60,0x4d,
2076     });
2077 
2078     test_asm(r, [&](A& a) {
2079         a.xtns2h(A::v0, A::v0);
2080         a.xtnh2b(A::v0, A::v0);
2081         a.strs  (A::v0, A::x0);
2082 
2083         a.ldrs   (A::v0, A::x0);
2084         a.uxtlb2h(A::v0, A::v0);
2085         a.uxtlh2s(A::v0, A::v0);
2086 
2087         a.uminv4s(A::v3, A::v4);
2088         a.movs   (A::x3, A::v4,0);  // mov.s w3,v4[0]
2089         a.movs   (A::x3, A::v4,1);  // mov.s w3,v4[1]
2090         a.inss   (A::v4, A::x3,3);  // ins.s v4[3],w3
2091     },{
2092         0x00,0x28,0x61,0x0e,
2093         0x00,0x28,0x21,0x0e,
2094         0x00,0x00,0x00,0xbd,
2095 
2096         0x00,0x00,0x40,0xbd,
2097         0x00,0xa4,0x08,0x2f,
2098         0x00,0xa4,0x10,0x2f,
2099 
2100         0x83,0xa8,0xb1,0x6e,
2101         0x83,0x3c,0x04,0x0e,
2102         0x83,0x3c,0x0c,0x0e,
2103         0x64,0x1c,0x1c,0x4e,
2104     });
2105 
2106     test_asm(r, [&](A& a) {
2107         a.ldrb(A::v0, A::x8);
2108         a.strb(A::v0, A::x8);
2109     },{
2110         0x00,0x01,0x40,0x3d,
2111         0x00,0x01,0x00,0x3d,
2112     });
2113 
2114     test_asm(r, [&](A& a) {
2115         a.ldrd(A::x0, A::x1, 3);   // ldr  x0, [x1, #24]
2116         a.ldrs(A::x0, A::x1, 3);   // ldr  w0, [x1, #12]
2117         a.ldrh(A::x0, A::x1, 3);   // ldrh w0, [x1, #6]
2118         a.ldrb(A::x0, A::x1, 3);   // ldrb w0, [x1, #3]
2119 
2120         a.strs(A::x0, A::x1, 3);   // str  w0, [x1, #12]
2121     },{
2122         0x20,0x0c,0x40,0xf9,
2123         0x20,0x0c,0x40,0xb9,
2124         0x20,0x0c,0x40,0x79,
2125         0x20,0x0c,0x40,0x39,
2126 
2127         0x20,0x0c,0x00,0xb9,
2128     });
2129 
2130     test_asm(r, [&](A& a) {
2131         a.tbl   (A::v0, A::v1, A::v2);
2132         a.uzp14s(A::v0, A::v1, A::v2);
2133         a.uzp24s(A::v0, A::v1, A::v2);
2134         a.zip14s(A::v0, A::v1, A::v2);
2135         a.zip24s(A::v0, A::v1, A::v2);
2136     },{
2137         0x20,0x00,0x02,0x4e,
2138         0x20,0x18,0x82,0x4e,
2139         0x20,0x58,0x82,0x4e,
2140         0x20,0x38,0x82,0x4e,
2141         0x20,0x78,0x82,0x4e,
2142     });
2143 }
2144 
DEF_TEST(SkVM_approx_math,r)2145 DEF_TEST(SkVM_approx_math, r) {
2146     auto eval = [](int N, float values[], auto fn) {
2147         skvm::Builder b;
2148         skvm::Ptr inout  = b.varying<float>();
2149 
2150         b.storeF(inout, fn(&b, b.loadF(inout)));
2151 
2152         b.done().eval(N, values);
2153     };
2154 
2155     auto compare = [r](int N, const float values[], const float expected[]) {
2156         for (int i = 0; i < N; ++i) {
2157             REPORTER_ASSERT(r, (values[i] == expected[i]) ||
2158                                SkScalarNearlyEqual(values[i], expected[i], 0.001f),
2159                                "evaluated to %g, but expected %g", values[i], expected[i]);
2160         }
2161     };
2162 
2163     // log2
2164     {
2165         float values[] = {0.25f, 0.5f, 1, 2, 4, 8};
2166         constexpr int N = std::size(values);
2167         eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
2168             return b->approx_log2(v);
2169         });
2170         const float expected[] = {-2, -1, 0, 1, 2, 3};
2171         compare(N, values, expected);
2172     }
2173 
2174     // pow2
2175     {
2176         float values[] = {-80, -5, -2, -1, 0, 1, 2, 3, 5, 160};
2177         constexpr int N = std::size(values);
2178         eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
2179             return b->approx_pow2(v);
2180         });
2181         const float expected[] = {0, 0.03125f, 0.25f, 0.5f, 1, 2, 4, 8, 32, INFINITY};
2182         compare(N, values, expected);
2183     }
2184     // powf -- 1^x
2185     {
2186         float exps[] = {-2, -1, 0, 1, 2};
2187         constexpr int N = std::size(exps);
2188         eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
2189             return b->approx_powf(b->splat(1.0f), exp);
2190         });
2191         const float expected[] = {1, 1, 1, 1, 1};
2192         compare(N, exps, expected);
2193     }
2194     // powf -- 2^x
2195     {
2196         float exps[] = {-80, -5, -2, -1, 0, 1, 2, 3, 5, 160};
2197         constexpr int N = std::size(exps);
2198         eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
2199             return b->approx_powf(2.0, exp);
2200         });
2201         const float expected[] = {0, 0.03125f, 0.25f, 0.5f, 1, 2, 4, 8, 32, INFINITY};
2202         compare(N, exps, expected);
2203     }
2204     // powf -- 3^x
2205     {
2206         float exps[] = {-2, -1, 0, 1, 2};
2207         constexpr int N = std::size(exps);
2208         eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
2209             return b->approx_powf(b->splat(3.0f), exp);
2210         });
2211         const float expected[] = {1/9.0f, 1/3.0f, 1, 3, 9};
2212         compare(N, exps, expected);
2213     }
2214     // powf -- x^0.5
2215     {
2216         float bases[] = {0, 1, 4, 9, 16};
2217         constexpr int N = std::size(bases);
2218         eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
2219             return b->approx_powf(base, b->splat(0.5f));
2220         });
2221         const float expected[] = {0, 1, 2, 3, 4};
2222         compare(N, bases, expected);
2223     }
2224     // powf -- x^1
2225     {
2226         float bases[] = {0, 1, 2, 3, 4};
2227         constexpr int N = std::size(bases);
2228         eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
2229             return b->approx_powf(base, b->splat(1.0f));
2230         });
2231         const float expected[] = {0, 1, 2, 3, 4};
2232         compare(N, bases, expected);
2233     }
2234     // powf -- x^2
2235     {
2236         float bases[] = {0, 1, 2, 3, 4};
2237         constexpr int N = std::size(bases);
2238         eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
2239             return b->approx_powf(base, b->splat(2.0f));
2240         });
2241         const float expected[] = {0, 1, 4, 9, 16};
2242         compare(N, bases, expected);
2243     }
2244 
2245     auto test = [r](float arg, float expected, float tolerance, auto prog) {
2246         skvm::Builder b;
2247         skvm::Ptr inout  = b.varying<float>();
2248         b.storeF(inout, prog(b.loadF(inout)));
2249         float actual = arg;
2250         b.done().eval(1, &actual);
2251 
2252         float err = std::abs(actual - expected);
2253 
2254         if (err > tolerance) {
2255     //        SkDebugf("arg %g, expected %g, actual %g\n", arg, expected, actual);
2256             REPORTER_ASSERT(r, true);
2257         }
2258         return err;
2259     };
2260 
2261     auto test2 = [r](float arg0, float arg1, float expected, float tolerance, auto prog) {
2262         skvm::Builder b;
2263         skvm::Ptr in0  = b.varying<float>();
2264         skvm::Ptr in1  = b.varying<float>();
2265         skvm::Ptr out  = b.varying<float>();
2266         b.storeF(out, prog(b.loadF(in0), b.loadF(in1)));
2267         float actual;
2268         b.done().eval(1, &arg0, &arg1, &actual);
2269 
2270         float err = std::abs(actual - expected);
2271 
2272         if (err > tolerance) {
2273     //        SkDebugf("[%g, %g]: expected %g, actual %g\n", arg0, arg1, expected, actual);
2274             REPORTER_ASSERT(r, true);
2275         }
2276         return err;
2277     };
2278 
2279     // sine, cosine, tangent
2280     {
2281         constexpr float P = SK_ScalarPI;
2282         constexpr float tol = 0.00175f;
2283         for (float rad = -5*P; rad <= 5*P; rad += 0.1f) {
2284             test(rad, sk_float_sin(rad), tol, [](skvm::F32 x) {
2285                 return approx_sin(x);
2286             });
2287             test(rad, sk_float_cos(rad), tol, [](skvm::F32 x) {
2288                 return approx_cos(x);
2289             });
2290         }
2291 
2292         // Our tangent diverge more as we get near infinities (x near +- Pi/2),
2293         // so bring in the domain a little.
2294         constexpr float eps = 0.16f;
2295         float err = 0;
2296         for (float rad = -P/2 + eps; rad <= P/2 - eps; rad += 0.01f) {
2297             err += test(rad, sk_float_tan(rad), tol, [](skvm::F32 x) {
2298                 return approx_tan(x);
2299             });
2300             // try again with some multiples of P, to check our periodicity
2301             test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
2302                 return approx_tan(x + 3*P);
2303             });
2304             test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
2305                 return approx_tan(x - 3*P);
2306             });
2307         }
2308         if ((false)) { SkDebugf("tan error %g\n", err); }
2309     }
2310 
2311     // asin, acos, atan
2312     {
2313         constexpr float tol = 0.00175f;
2314         float err = 0;
2315         for (float x = -1; x <= 1; x += 1.0f/64) {
2316             err += test(x, asin(x), tol, [](skvm::F32 x) {
2317                 return approx_asin(x);
2318             });
2319             test(x, acos(x), tol, [](skvm::F32 x) {
2320                 return approx_acos(x);
2321             });
2322         }
2323         if ((false)) { SkDebugf("asin error %g\n", err); }
2324 
2325         err = 0;
2326         for (float x = -10; x <= 10; x += 1.0f/16) {
2327             err += test(x, atan(x), tol, [](skvm::F32 x) {
2328                 return approx_atan(x);
2329             });
2330         }
2331         if ((false)) { SkDebugf("atan error %g\n", err); }
2332 
2333         for (float y = -3; y <= 3; y += 1) {
2334             for (float x = -3; x <= 3; x += 1) {
2335                 err += test2(y, x, atan2(y,x), tol, [](skvm::F32 y, skvm::F32 x) {
2336                     return approx_atan2(y,x);
2337                 });
2338             }
2339         }
2340         if ((false)) { SkDebugf("atan2 error %g\n", err); }
2341     }
2342 }
2343 
DEF_TEST(SkVM_min_max,r)2344 DEF_TEST(SkVM_min_max, r) {
2345     // min() and max() have subtle behavior when one argument is NaN and
2346     // the other isn't.  It's not sound to blindly swap their arguments.
2347     //
2348     // All backends must behave like std::min() and std::max(), which are
2349     //
2350     //    min(x,y) = y<x ? y : x
2351     //    max(x,y) = x<y ? y : x
2352 
2353     // ±NaN, ±0, ±1, ±inf
2354     const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
2355                              0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
2356 
2357     float f[8];
2358     memcpy(f, bits, sizeof(bits));
2359 
2360     auto identical = [&](float x, float y) {
2361         uint32_t X,Y;
2362         memcpy(&X, &x, 4);
2363         memcpy(&Y, &y, 4);
2364         return X == Y;
2365     };
2366 
2367     // Test min/max with non-constant x, non-constant y.
2368     // (Whether x and y are varying or uniform shouldn't make any difference.)
2369     {
2370         skvm::Builder b;
2371         {
2372             skvm::Ptr src = b.varying<float>(),
2373                        mn = b.varying<float>(),
2374                        mx = b.varying<float>();
2375 
2376             skvm::F32 x = b.loadF(src),
2377                       y = b.uniformF(b.uniform(), 0);
2378 
2379             b.storeF(mn, b.min(x,y));
2380             b.storeF(mx, b.max(x,y));
2381         }
2382 
2383         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2384             float mn[8], mx[8];
2385             for (int i = 0; i < 8; i++) {
2386                 // min() and max() everything with f[i].
2387                 program.eval(8, f,mn,mx, &f[i]);
2388 
2389                 for (int j = 0; j < 8; j++) {
2390                     REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
2391                     REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
2392                 }
2393             }
2394         });
2395     }
2396 
2397     // Test each with constant on the right.
2398     for (int i = 0; i < 8; i++) {
2399         skvm::Builder b;
2400         {
2401             skvm::Ptr src = b.varying<float>(),
2402                        mn = b.varying<float>(),
2403                        mx = b.varying<float>();
2404 
2405             skvm::F32 x = b.loadF(src),
2406                       y = b.splat(f[i]);
2407 
2408             b.storeF(mn, b.min(x,y));
2409             b.storeF(mx, b.max(x,y));
2410         }
2411 
2412         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2413             float mn[8], mx[8];
2414             program.eval(8, f,mn,mx);
2415             for (int j = 0; j < 8; j++) {
2416                 REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
2417                 REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
2418             }
2419         });
2420     }
2421 
2422     // Test each with constant on the left.
2423     for (int i = 0; i < 8; i++) {
2424         skvm::Builder b;
2425         {
2426             skvm::Ptr src = b.varying<float>(),
2427                        mn = b.varying<float>(),
2428                        mx = b.varying<float>();
2429 
2430             skvm::F32 x = b.splat(f[i]),
2431                       y = b.loadF(src);
2432 
2433             b.storeF(mn, b.min(x,y));
2434             b.storeF(mx, b.max(x,y));
2435         }
2436 
2437         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2438             float mn[8], mx[8];
2439             program.eval(8, f,mn,mx);
2440             for (int j = 0; j < 8; j++) {
2441                 REPORTER_ASSERT(r, identical(mn[j], std::min(f[i], f[j])));
2442                 REPORTER_ASSERT(r, identical(mx[j], std::max(f[i], f[j])));
2443             }
2444         });
2445     }
2446 }
2447 
DEF_TEST(SkVM_halfs, r) {
    // Checks from_fp16() and to_fp16() against a table of half-float bit patterns
    // and the float value each one corresponds to.
    static constexpr int kCount = 8;

    const uint16_t kHalfBits[kCount] = {0x0000,0x3800,0x3c00,0x4000,
                                        0xc400,0xb800,0xbc00,0xc000};
    const float kFloats[kCount] = {+0.0f,+0.5f,+1.0f,+2.0f,
                                   -4.0f,-0.5f,-1.0f,-2.0f};

    // Half -> float.
    {
        skvm::Builder b;
        skvm::Ptr halfIn   = b.varying<uint16_t>();
        skvm::Ptr floatOut = b.varying<float>();
        b.storeF(floatOut, b.from_fp16(b.load16(halfIn)));

        test_jit_and_interpreter(b, [&](const skvm::Program& p){
            float got[kCount];
            p.eval(kCount, kHalfBits, got);
            for (int lane = 0; lane < kCount; ++lane) {
                REPORTER_ASSERT(r, got[lane] == kFloats[lane]);
            }
        });
    }

    // Float -> half.
    {
        skvm::Builder b;
        skvm::Ptr floatIn = b.varying<float>();
        skvm::Ptr halfOut = b.varying<uint16_t>();
        b.store16(halfOut, b.to_fp16(b.loadF(floatIn)));

        test_jit_and_interpreter(b, [&](const skvm::Program& p){
            uint16_t got[kCount];
            p.eval(kCount, kFloats, got);
            for (int lane = 0; lane < kCount; ++lane) {
                REPORTER_ASSERT(r, got[lane] == kHalfBits[lane]);
            }
        });
    }
}
2482 
DEF_TEST(SkVM_64bit, r) {
    // Checks load64() and store64() by splitting 64-bit values into two 32-bit
    // halves and by joining two 32-bit halves back into 64-bit values.
    static constexpr int kCount = 65;

    uint32_t lowHalf[kCount];
    uint32_t highHalf[kCount];
    uint64_t joined[kCount];
    for (int i = 0; i < kCount; ++i) {
        // Even numbers go in the low halves and odd numbers in the high halves.
        lowHalf[i]  = 2*i;
        highHalf[i] = 2*i + 1;
        joined[i]   = (static_cast<uint64_t>(highHalf[i]) << 32)
                    |  static_cast<uint64_t>(lowHalf[i]);
    }

    // Split: read each 64-bit value and store its two 32-bit halves.
    {
        skvm::Builder b;
        {
            skvm::Ptr widePtr = b.varying<uint64_t>();
            skvm::Ptr loPtr   = b.varying<int>();
            skvm::Ptr hiPtr   = b.varying<int>();
            b.store32(loPtr, b.load64(widePtr, 0));
            b.store32(hiPtr, b.load64(widePtr, 1));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& p){
            uint32_t gotLow[kCount], gotHigh[kCount];
            p.eval(kCount, joined,gotLow,gotHigh);
            for (int i = 0; i < kCount; ++i) {
                REPORTER_ASSERT(r, gotLow[i] == lowHalf[i]);
                REPORTER_ASSERT(r, gotHigh[i] == highHalf[i]);
            }
        });
    }

    // Join: read two 32-bit halves and store them as one 64-bit value.
    {
        skvm::Builder b;
        {
            skvm::Ptr widePtr = b.varying<uint64_t>();
            skvm::Ptr loPtr   = b.varying<int>();
            skvm::Ptr hiPtr   = b.varying<int>();
            b.store64(widePtr, b.load32(loPtr), b.load32(hiPtr));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& p){
            uint64_t gotJoined[kCount];
            p.eval(kCount, gotJoined,lowHalf,highHalf);
            for (int i = 0; i < kCount; ++i) {
                REPORTER_ASSERT(r, gotJoined[i] == joined[i]);
            }
        });
    }
}
2530 
DEF_TEST(SkVM_128bit, r) {
    // Round-trips 63 RGBA pixels between an F32 format (16 bytes per pixel) and
    // an 8888 format (4 bytes per pixel).
    static constexpr int kPixels   = 63;
    static constexpr int kChannels = 4*kPixels;

    float   floats[kChannels];
    uint8_t packed[kChannels];

    // Channel i holds i/255, which the first program should pack to byte i.
    for (int i = 0; i < kChannels; ++i) {
        floats[i] = i * (1/255.0f);
    }

    const skvm::PixelFormat f32Format = skvm::SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);
    const skvm::PixelFormat u8Format  = skvm::SkColorType_to_PixelFormat(kRGBA_8888_SkColorType);

    {  // RGBA F32 -> RGBA 8888, which exercises 128-bit loads.
        skvm::Builder b;
        {
            skvm::Ptr dst = b.varying(4);
            skvm::Ptr src = b.varying(16);

            b.store(u8Format, dst, b.load(f32Format, src));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& p){
            memset(packed, 0, sizeof(packed));
            p.eval(kPixels, packed, floats);
            for (int i = 0; i < kChannels; ++i) {
                REPORTER_ASSERT(r, packed[i] == i);
            }
        });
    }

    {  // RGBA 8888 -> RGBA F32, which exercises 128-bit stores.
       // This reads the bytes left in `packed` by the conversion above.
        skvm::Builder b;
        {
            skvm::Ptr dst = b.varying(16);
            skvm::Ptr src = b.varying(4);

            b.store(f32Format, dst, b.load(u8Format, src));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& p){
            memset(floats, 0, sizeof(floats));
            p.eval(kPixels, floats, packed);
            for (int i = 0; i < kChannels; ++i) {
                REPORTER_ASSERT(r, floats[i] == i * (1/255.0f));
            }
        });
    }
}
2580 
DEF_TEST(SkVM_is_NaN_is_finite, r) {
    // Checks that is_NaN() and is_finite() yield all-ones masks for exactly the
    // inputs that are NaN / finite, and zero for everything else.
    skvm::Builder b;
    {
        skvm::Ptr srcPtr = b.varying<float>();
        skvm::Ptr nanPtr = b.varying<int>();
        skvm::Ptr finPtr = b.varying<int>();
        b.store32(nanPtr, is_NaN   (b.loadF(srcPtr)));
        b.store32(finPtr, is_finite(b.loadF(srcPtr)));
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& p){
        static constexpr int kCount = 8;
        // ±NaN, ±0, ±1, ±inf
        const uint32_t kBits[kCount] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                                        0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        const uint32_t kAllOnes = 0xffffffff,
                       kZero    = 0;

        uint32_t nanMask[kCount], finMask[kCount];
        p.eval(kCount, kBits, nanMask,finMask);

        for (int i = 0; i < kCount; ++i) {
            const bool wantNaN    = i < 2;            // the two NaN entries
            const bool wantFinite = 2 <= i && i <= 5; // the ±0 and ±1 entries
            REPORTER_ASSERT(r, nanMask[i] == (wantNaN    ? kAllOnes : kZero));
            REPORTER_ASSERT(r, finMask[i] == (wantFinite ? kAllOnes : kZero));
        }
    });
}
2604 
// Verifies a program can take at least six pointer arguments: one destination plus
// five sources whose values are summed.
DEF_TEST(SkVM_args, r) {
    skvm::Builder b;
    {
        skvm::Ptr dst = b.varying<float>();
        skvm::Ptr   A = b.varying<float>();
        skvm::Ptr   B = b.varying<float>();
        skvm::Ptr   C = b.varying<float>();
        skvm::Ptr   D = b.varying<float>();
        skvm::Ptr   E = b.varying<float>();

        // dst = A + B + C + D + E
        storeF(dst, b.loadF(A)
                  + b.loadF(B)
                  + b.loadF(C)
                  + b.loadF(D)
                  + b.loadF(E));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kN = 17;
        float dst[kN], A[kN], B[kN], C[kN], D[kN], E[kN];

        // Every source holds its own index, so the sum at index i is 5*i.
        for (int i = 0; i < kN; i++) {
            const float value = (float)i;
            A[i] = value;
            B[i] = value;
            C[i] = value;
            D[i] = value;
            E[i] = value;
        }

        program.eval(kN, dst,A,B,C,D,E);

        for (int i = 0; i < kN; i++) {
            REPORTER_ASSERT(r, dst[i] == 5.0f*i);
        }
    });
}
2633 
// Regression test reduced from a real failing draw; the original culprit was a bad
// arm64 implementation of pack().
DEF_TEST(SkVM_badpack, reporter) {
    skvm::Builder p;
    {
        skvm::UPtr uniforms = p.uniform();
        skvm::Ptr  dst      = p.varying<uint16_t>();

        // Red comes from the float uniform at byte offset 8, scaled to 0..15; alpha is 0xf.
        skvm::I32 r = round(p.uniformF(uniforms, 8) * 15);
        skvm::I32 a = p.splat(0xf);

        // Pack r at bit 12 and a at bit 0 on top of an all-zero base.
        skvm::I32 packed = pack(pack(p.splat(0), r, 12), a, 0);
        store16(dst, packed);
    }

    test_jit_and_interpreter(p, [&](const skvm::Program& program){
        // Byte offset 8 is the third float, 1.0f, so r == 15 and the result is 0xf00f.
        const float uniforms[] = { 0.0f, 0.0f,
                                   1.0f, 0.0f, 0.0f, 1.0f };
        constexpr int kCount = 17;
        constexpr int kWant  = 0xf00f;

        uint16_t dst[kCount] = {};
        program.eval(kCount, uniforms,dst);
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(reporter, dst[i] == kWant, "got %04x, want %04x\n", dst[i], kWant);
        }
    });
}
2662 
// Checks that the `fma` feature flag controls whether x*x+x is optimized into a
// single fused op (3 instructions) or a separate mul and add (4 instructions).
DEF_TEST(SkVM_features, r) {
    // Emits: load x, compute x*x+x, store the result.
    auto build_program = [](skvm::Builder* b) {
        skvm::F32 x = b->loadF(b->varying<float>());
        b->storeF(b->varying<float>(), x*x+x);
    };

    // Builds the program with FMA forced on or off and returns the optimized size.
    auto optimized_size = [&](bool fma) {
        skvm::Features features;
        features.fma = fma;
        skvm::Builder b(features);
        build_program(&b);
        return b.optimize().size();
    };

    // load-fma-store with FMA available.
    REPORTER_ASSERT(r, optimized_size(/*fma=*/true) == 3);

    // load-mul-add-store without FMA.
    REPORTER_ASSERT(r, optimized_size(/*fma=*/false) == 4);

    {   // Auto-detected features: either outcome is acceptable.
        skvm::Builder b;
        build_program(&b);
        const auto size = b.optimize().size();
        REPORTER_ASSERT(r, size == 3 || size == 4);
    }
}
2692 
// A gather is only as varying as its index: with a varying index it must stay inside
// the loop, while with a uniform index it can be hoisted ahead of the loop.
DEF_TEST(SkVM_gather_can_hoist, r) {
    {   // Typical case: the index is loaded per-element, so it is varying.
        skvm::Builder b;
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  buf      = b.varying<int>();
        skvm::I32  index    = b.load32(buf);
        b.store32(buf, b.gather32(uniforms,0, index));

        skvm::Program program = b.done();

        // The index is varying, and therefore so is the gather:
        //
        // loop:
        //     v0 = load32 buf
        //     v1 = gather32 uniforms+0 v0
        //     store32 buf v1
        REPORTER_ASSERT(r, program.instructions().size() == 3);
        REPORTER_ASSERT(r, program.loop() == 0);
    }

    {   // Same program, except the index now comes from a uniform.
        skvm::Builder b;
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  buf      = b.varying<int>();
        skvm::I32  index    = b.uniform32(uniforms,8);
        b.store32(buf, b.gather32(uniforms,0, index));

        skvm::Program program = b.done();

        // The index is uniform, so the gather is too and only the store remains in the loop:
        //
        // v0 = uniform32 uniforms+8
        // v1 = gather32 uniforms+0 v0
        // loop:
        //     store32 buf v1
        REPORTER_ASSERT(r, program.instructions().size() == 3);
        REPORTER_ASSERT(r, program.loop() == 2);
    }
}
2735 
// Ops with identical arguments are normally assumed to produce identical values and
// get deduplicated, which acts as a simple common-subexpression eliminator.
//
// That is unsound for two identical loads separated by a store. If the loads in this
// program were deduplicated, each element would be incremented by 1 rather than by K.
DEF_TEST(SkVM_dont_dedup_loads, r) {
    constexpr int K = 2;
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();
        // K rounds of load / add 1 / store on the same buffer.
        for (int round = 0; round < K; round++) {
            b.store32(buf, b.load32(buf) + 1);
        }
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        int buf[] = { 0,1,2,3,4 };
        program.eval(std::size(buf), buf);

        // Element i started as i, so it must now equal i+K.
        int index = 0;
        for (int value : buf) {
            REPORTER_ASSERT(r, value == index+K);
            index++;
        }
    });
}
2759 
// Companion to SkVM_dont_dedup_loads: stores cannot be deduplicated either. A different
// store between two identical stores overwrites the first, so the later identical store
// really does have to be issued again.
DEF_TEST(SkVM_dont_dedup_stores, r) {
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();
        b.store32(buf, b.splat(4));
        b.store32(buf, b.splat(5));
        b.store32(buf, b.splat(4));   // Deduplicating would drop this store and leave 5.
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        // Deliberately uninitialized: the program is expected to write every element.
        int buf[42];
        program.eval(std::size(buf), buf);
        for (const int value : buf) {
            REPORTER_ASSERT(r, value == 4);
        }
    });
}
2780 
// Compares fast_mul(0, x) with an ordinary 0*x. The test expects fast_mul to yield zero
// for every input, while the ordinary product yields NaN for infinite and NaN inputs.
DEF_TEST(SkVM_fast_mul, r) {
    skvm::Builder b;
    {
        skvm::Ptr src  = b.varying<float>();
        skvm::Ptr fast = b.varying<float>();
        skvm::Ptr slow = b.varying<float>();

        skvm::F32 x = b.loadF(src);
        b.storeF(fast, fast_mul(0.0f, x));
        b.storeF(slow, 0.0f * x);
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 8;
        const uint32_t bits[] = {
            0x0000'0000, 0x8000'0000, //±0
            0x3f80'0000, 0xbf80'0000, //±1
            0x7f80'0000, 0xff80'0000, //±inf
            0x7f80'0001, 0xff80'0001, //±NaN
        };
        float fastOut[kCount],
              slowOut[kCount];
        program.eval(kCount,bits,fastOut,slowOut);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, fastOut[i] == 0.0f);

            // The first four inputs (±0, ±1) are finite, so 0*x is an ordinary zero.
            const bool finiteInput = i < 4;
            if (!finiteInput) {
                REPORTER_ASSERT(r, std::isnan(slowOut[i]));
                continue;
            }
            REPORTER_ASSERT(r, slowOut[i] == 0.0f);
        }
    });
}
2813 
// Checks Op::duplicate handling. A builder constructed with `true` is expected to leave
// duplicate ops in the finalized program (and dead-code elimination must drop them),
// while a builder constructed with `false` must never produce them.
DEF_TEST(SkVM_duplicates, reporter) {
    {
        // presumably this bool is the createDuplicates flag — confirm against the Builder ctor.
        skvm::Builder p(true);
        auto rptr = p.varying<int>();

        skvm::F32 r = p.loadF(rptr);
        skvm::F32 g = p.splat(0.0f);
        skvm::F32 b = p.splat(0.0f);
        skvm::F32 a = p.splat(1.0f);

        p.unpremul(&r, &g, &b, a);
        p.storeF(rptr, r);

        // NOTE(review): `b` is the blue-channel skvm::F32 here, not a Builder. This call
        // presumably reaches `p` through F32::operator-> — confirm, and consider p.program().
        std::vector<skvm::Instruction> program = b->program();

        // Finalizing the raw program should leave at least one duplicate op in place.
        auto withDuplicates = skvm::finalize(program);
        const auto duplicates = std::count_if(std::begin(withDuplicates),
                                              std::end(withDuplicates),
                                              [](const auto& instr) {
                                                  return instr.op == skvm::Op::duplicate;
                                              });
        REPORTER_ASSERT(reporter, duplicates > 0);

        // Dead-code elimination on the same program should remove every duplicate op.
        auto eliminatedAsDeadCode = skvm::eliminate_dead_code(program);
        for (const auto& instr : eliminatedAsDeadCode) {
            REPORTER_ASSERT(reporter, instr.op != skvm::Op::duplicate);
        }
    }

    {
        // Same program with the flag off: the finished program has no duplicate ops.
        skvm::Builder p(false);
        auto rptr = p.varying<int>();

        skvm::F32 r = p.loadF(rptr);
        skvm::F32 g = p.splat(0.0f);
        skvm::F32 b = p.splat(0.0f);
        skvm::F32 a = p.splat(1.0f);

        p.unpremul(&r, &g, &b, a);
        p.storeF(rptr, r);

        auto withoutDuplicates = p.done().instructions();
        for (const auto& instr : withoutDuplicates) {
            REPORTER_ASSERT(reporter, instr.op != skvm::Op::duplicate);
        }
    }
}
2862 
// Compiles a small SkSL function to SkVM with a debug trace and visualizer attached,
// dumps the visualizer's HTML, and checks that the HTML contains each kind of row the
// visualizer is expected to emit.
DEF_TEST(SkVM_Visualizer, r) {
    const char* src =
            "int main(int x, int y) {\n"
            "   int a = 99;\n"
            "   if (x > 0) a += 100;\n"
            "   if (y > 0) a += 101;\n"
            "   a = 102;\n"
            "   return a;\n"
            "}";
    SkSL::Compiler compiler(SkSL::ShaderCapsFactory::Default());
    SkSL::ProgramSettings settings;
    auto program = compiler.convertProgram(SkSL::ProgramKind::kGeneric,
                                           std::string(src), settings);
    // Report a failure instead of crashing if the SkSL does not compile.
    REPORTER_ASSERT(r, program != nullptr);
    if (program == nullptr) {
        return;
    }

    const SkSL::FunctionDeclaration* main = program->getFunction("main");
    // Both the declaration and its definition are dereferenced below.
    REPORTER_ASSERT(r, main != nullptr && main->definition() != nullptr);
    if (main == nullptr || main->definition() == nullptr) {
        return;
    }

    SkSL::SkVMDebugTrace d;
    d.setSource(src);
    auto v = std::make_unique<skvm::viz::Visualizer>(&d);
    skvm::Builder b(skvm::Features{}, /*createDuplicates=*/true);
    SkSL::ProgramToSkVM(*program, *main->definition(), &b, &d, /*uniforms=*/{});

    // Ownership of the visualizer moves into the program; it is read back via visualizer().
    skvm::Program p = b.done(nullptr, true, std::move(v));
    SkDynamicMemoryWStream vizFile;
    p.visualizer()->dump(&vizFile);
    auto vizData = vizFile.detachAsData();
    std::string html((const char*)vizData->data(), vizData->size());
    //b.dump();
    //std::printf("%s", html.c_str());

    // True when `needle` occurs somewhere in the dumped HTML.
    auto html_contains = [&](const char* needle) {
        return std::strstr(html.c_str(), needle) != nullptr;
    };

    // Check that html contains all types of information:
    REPORTER_ASSERT(r, html_contains("<tr class='normal'>"));       // SkVM byte code
    REPORTER_ASSERT(r, html_contains("<tr class='source'>"));       // SkSL source
    REPORTER_ASSERT(r, html_contains("<tr class='dead'>"));         // dead code
    REPORTER_ASSERT(r, html_contains("<tr class='dead deduped'>")); // deduped removed
    REPORTER_ASSERT(r, html_contains(                               // deduped origins
                       "<tr class='normal origin'>"
                       "<td>&#8593;&#8593;&#8593; *13</td>"
                       "<td>v2 = splat 0 (0)</td></tr>"));
    REPORTER_ASSERT(r, html_contains(                               // trace enter
                       "<tr class='source'><td class='mask'>&#8618;v9</td>"
                       "<td colspan=2>int main(int x, int y)</td></tr>"));
    REPORTER_ASSERT(r, html_contains(                               // trace exit
                       "<tr class='source'><td class='mask'>&#8617;v9</td>"
                       "<td colspan=2>int main(int x, int y)</td></tr>"));
}
2906