• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkColorPriv.h"
9 #include "include/private/SkColorData.h"
10 #include "src/core/SkCpu.h"
11 #include "src/core/SkMSAN.h"
12 #include "src/core/SkVM.h"
13 #include "tests/Test.h"
14 
15 template <typename Fn>
test_jit_and_interpreter(const skvm::Builder & b,Fn && test)16 static void test_jit_and_interpreter(const skvm::Builder& b, Fn&& test) {
17     skvm::Program p = b.done();
18     test(p);
19     if (p.hasJIT()) {
20         test(b.done(/*debug_name=*/nullptr, /*allow_jit=*/false));
21     }
22 }
23 
DEF_TEST(SkVM_eliminate_dead_code, r) {
    // Build load, add, splat, add -- none of which feeds a store or any other side effect.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int>();
        skvm::I32 loaded  = builder.load32(ptr);
        skvm::I32 doubled = builder.add(loaded, loaded);
        builder.add(doubled, builder.splat(7));
    }

    // The raw builder program still holds all four instructions...
    std::vector<skvm::Instruction> insts = builder.program();
    REPORTER_ASSERT(r, insts.size() == 4);

    // ...and the test expects dead-code elimination to remove every one of them.
    insts = skvm::eliminate_dead_code(insts);
    REPORTER_ASSERT(r, insts.empty());
}
39 
DEF_TEST(SkVM_Pointless, r) {
    // A program with no memory arguments at all.  Everything in it should be
    // flagged as dead code, yet we should still be able to "run" it.
    skvm::Builder builder;
    builder.add(builder.splat(5.0f),
                builder.splat(4.0f));

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // Evaluate with every count from 0 through 63.
        int n = 0;
        while (n < 64) {
            p.eval(n);
            n++;
        }
    });

    // Every optimized instruction is expected to die immediately and be hoistable.
    const std::vector<skvm::OptimizedInstruction> optimized = builder.optimize();
    for (size_t i = 0; i < optimized.size(); i++) {
        REPORTER_ASSERT(r, optimized[i].death == 0 && optimized[i].can_hoist == true);
    }
}
59 
DEF_TEST(SkVM_memset, r) {
    // The program stores the constant 42 through its single varying pointer.
    skvm::Builder builder;
    builder.store32(builder.varying<int>(), builder.splat(42));

    test_jit_and_interpreter(builder, [&](const skvm::Program& program) {
        constexpr int kCount = 17;
        int buf[kCount + 1];
        buf[kCount] = 47;   // Sentinel just past the range handed to eval().

        program.eval(kCount, buf);

        // The first kCount entries are filled...
        for (int k = 0; k != kCount; k++) {
            REPORTER_ASSERT(r, buf[k] == 42);
        }
        // ...and the sentinel after them is left untouched.
        REPORTER_ASSERT(r, buf[kCount] == 47);
    });
}
75 
DEF_TEST(SkVM_memcpy, r) {
    // dst[i] = src[i]
    skvm::Builder builder;
    {
        auto from = builder.varying<int>(),
             to   = builder.varying<int>();
        builder.store32(to, builder.load32(from));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& program) {
        int src[] = {1,2,3,4,5,6,7,8,9},
            dst[] = {0,0,0,0,0,0,0,0,0};

        // Copy all but the final element so we can verify the last slot is left alone.
        const size_t copied = SK_ARRAY_COUNT(src)-1;
        program.eval(copied, src, dst);

        for (size_t k = 0; k != copied; k++) {
            REPORTER_ASSERT(r, dst[k] == src[k]);
        }
        REPORTER_ASSERT(r, dst[copied] == 0);
    });
}
96 
DEF_TEST(SkVM_allow_jit, r) {
    // Same copy program as SkVM_memcpy: dst[i] = src[i].
    skvm::Builder builder;
    {
        auto from = builder.varying<int>(),
             to   = builder.varying<int>();
        builder.store32(to, builder.load32(from));
    }

    // Only when building with allow_jit=true actually produces JIT code do we
    // check that allow_jit=false produces a program without it.
    const bool jitted = builder.done("test-allow_jit", /*allow_jit=*/true).hasJIT();
    if (jitted) {
        REPORTER_ASSERT(r, !builder.done("", false).hasJIT());
    }
}
109 
DEF_TEST(SkVM_LoopCounts, r) {
    // Run one program with every count from 0 through 64, so each exact N is covered.

    // buf[i] += 1
    skvm::Builder builder;
    skvm::Ptr ptr = builder.varying<int>();
    builder.store32(ptr,
                    builder.add(builder.splat(1),
                                builder.load32(ptr)));

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int buf[64];
        const int kLen = (int)SK_ARRAY_COUNT(buf);

        for (int n = 0; n <= kLen; n++) {
            // Start each round from buf[i] == i.
            for (int i = 0; i < kLen; i++) {
                buf[i] = i;
            }
            p.eval(n, buf);

            // The first n entries were bumped by one; everything after is unchanged.
            for (int i = 0; i < kLen; i++) {
                const int want = i < n ? i+1 : i;
                REPORTER_ASSERT(r, buf[i] == want);
            }
        }
    });
}
137 
DEF_TEST(SkVM_gather32, r) {
    // buf[i] = gather32(uniforms, 0, buf[i] & 7)
    skvm::Builder builder;
    {
        skvm::UPtr uniforms = builder.uniform();
        skvm::Ptr ptr = builder.varying<int>();
        skvm::I32 index = builder.load32(ptr);
        builder.store32(ptr, builder.gather32(uniforms,0, builder.bit_and(index, builder.splat(7))));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        constexpr int kCount = 20;
        int buf[kCount];
        for (int k = 0; k < kCount; k++) {
            buf[k] = k;
        }

        // The uniforms hold a single pointer to the image at offset 0.
        struct Uniforms {
            const int* img;
        } uniforms{img};

        p.eval(kCount, &uniforms, buf);

        // Indices were masked to 0..7, so the eight image values repeat: two full
        // passes followed by the first four entries again.
        for (int k = 0; k < kCount; k++) {
            REPORTER_ASSERT(r, buf[k] == img[k % 8]);
        }
    });
}
185 
DEF_TEST(SkVM_gathers,r)186 DEF_TEST(SkVM_gathers, r) {
187     skvm::Builder b;
188     {
189         skvm::UPtr uniforms = b.uniform();
190         skvm::Ptr buf32    = b.varying<int>(),
191                   buf16    = b.varying<uint16_t>(),
192                   buf8     = b.varying<uint8_t>();
193 
194         skvm::I32 x = b.load32(buf32);
195 
196         b.store32(buf32, b.gather32(uniforms,0, b.bit_and(x, b.splat( 7))));
197         b.store16(buf16, b.gather16(uniforms,0, b.bit_and(x, b.splat(15))));
198         b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
199     }
200 
201     test_jit_and_interpreter(b, [&](const skvm::Program& program) {
202         const int img[] = {12,34,56,78, 90,98,76,54};
203 
204         constexpr int N = 20;
205         int      buf32[N];
206         uint16_t buf16[N];
207         uint8_t  buf8 [N];
208 
209         for (int i = 0; i < 20; i++) {
210             buf32[i] = i;
211         }
212 
213         struct Uniforms {
214             const int* img;
215         } uniforms{img};
216 
217         program.eval(N, &uniforms, buf32, buf16, buf8);
218         int i = 0;
219         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 12); i++;
220         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
221         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
222         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
223         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 56 && buf8[i] == 34); i++;
224         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
225         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 78 && buf8[i] ==  0); i++;
226         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
227 
228         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 90 && buf8[i] == 56); i++;
229         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
230         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 98 && buf8[i] ==  0); i++;
231         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
232         REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 76 && buf8[i] == 78); i++;
233         REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
234         REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 54 && buf8[i] ==  0); i++;
235         REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;
236 
237         REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 90); i++;
238         REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
239         REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
240         REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
241     });
242 }
243 
DEF_TEST(SkVM_gathers2, r) {
    // Like SkVM_gathers, but the loaded value is used directly as the gather index
    // (no masking) for all three widths.
    skvm::Builder builder;
    {
        skvm::UPtr uniforms = builder.uniform();
        skvm::Ptr ptr32 = builder.varying<int>(),
                  ptr16 = builder.varying<uint16_t>(),
                  ptr8  = builder.varying<uint8_t>();

        skvm::I32 index = builder.load32(ptr32);

        builder.store32(ptr32, builder.gather32(uniforms,0, index));
        builder.store16(ptr16, builder.gather16(uniforms,0, index));
        builder.store8 (ptr8 , builder.gather8 (uniforms,0, index));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // img[k] == k for every byte, so gathered values reveal which bytes were read.
        uint8_t img[256];
        for (int k = 0; k < 256; k++) {
            img[k] = k;
        }

        constexpr int kCount = 64;
        int      buf32[kCount];
        uint16_t buf16[kCount];
        uint8_t  buf8 [kCount];

        // Scatter the indices around 0..63 and clear the narrower outputs.
        for (int k = 0; k < kCount; k++) {
            buf32[k] = (k*47)&63;
            buf16[k] = 0;
            buf8 [k] = 0;
        }

        struct Uniforms {
            const uint8_t* img;
        } uniforms{img};

        p.eval(kCount, &uniforms, buf32, buf16, buf8);

        // The 8-bit gather returns the index itself.
        for (int k = 0; k < kCount; k++) {
            REPORTER_ASSERT(r, buf8[k] == ((k*47)&63));  // 0,47,30,13,60,...
        }

        // Spot-check the first and last lanes of the wider gathers.
        REPORTER_ASSERT(r, buf16[ 0] == 0x0100);
        REPORTER_ASSERT(r, buf16[63] == 0x2322);

        REPORTER_ASSERT(r, buf32[ 0] == 0x03020100);
        REPORTER_ASSERT(r, buf32[63] == 0x47464544);
    });
}
292 
DEF_TEST(SkVM_bitops, r) {
    // Chain the bitwise and shift ops together.  The trailing comments track the
    // value for the input 0x42 used below.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int>();

        skvm::I32 bits = builder.load32(ptr);

        bits = builder.bit_and  (bits, builder.splat(0xf1));  // 0x40
        bits = builder.bit_or   (bits, builder.splat(0x80));  // 0xc0
        bits = builder.bit_xor  (bits, builder.splat(0xfe));  // 0x3e
        bits = builder.bit_clear(bits, builder.splat(0x30));  // 0x0e

        bits = builder.shl(bits, 28);  // 0xe000'0000
        bits = builder.sra(bits, 28);  // 0xffff'fffe
        bits = builder.shr(bits,  1);  // 0x7fff'ffff

        builder.store32(ptr, bits);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int value = 0x42;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 0x7fff'ffff);
    });
}
318 
DEF_TEST(SkVM_select_is_NaN, r) {
    // dst = is_NaN(src) ? 0 : src
    skvm::Builder builder;
    {
        skvm::Ptr src = builder.varying<float>(),
                  dst = builder.varying<float>();

        skvm::F32 v = builder.loadF(src);
        v = select(is_NaN(v), builder.splat(0.0f)
                            , v);
        builder.storeF(dst, v);
    }

    // The optimized program is expected to be exactly these four ops, in this order.
    const skvm::Op want_ops[] = {
        skvm::Op::load32,
        skvm::Op::neq_f32,
        skvm::Op::bit_clear,
        skvm::Op::store32,
    };
    std::vector<skvm::OptimizedInstruction> optimized = builder.optimize();
    REPORTER_ASSERT(r, optimized.size() == SK_ARRAY_COUNT(want_ops));
    for (size_t k = 0; k < SK_ARRAY_COUNT(want_ops); k++) {
        REPORTER_ASSERT(r, optimized[k].op == want_ops[k]);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // ±NaN, ±0, ±1, ±inf
        uint32_t src[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                          0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        uint32_t dst[SK_ARRAY_COUNT(src)];
        p.eval(SK_ARRAY_COUNT(src), src, dst);

        // Only the two leading NaNs are replaced by zero; everything else is bit-identical.
        for (int k = 0; k < (int)SK_ARRAY_COUNT(src); k++) {
            const uint32_t want = k < 2 ? 0 : src[k];
            REPORTER_ASSERT(r, dst[k] == want);
        }
    });
}
350 
DEF_TEST(SkVM_f32, r) {
    // Basic float arithmetic: ((x+x) - x) / x should come out as exactly 1.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<float>();

        skvm::F32 x      = builder.loadF(ptr),
                  twice  = builder.add(x,x),        // 2x
                  same   = builder.sub(twice,x),    // 2x-x = x
                  unit   = builder.div(same,x);     // x/x = 1
        builder.storeF(ptr, unit);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float buf[] = { 1,2,3,4,5,6,7,8,9 };
        p.eval(SK_ARRAY_COUNT(buf), buf);
        for (int k = 0; k < (int)SK_ARRAY_COUNT(buf); k++) {
            REPORTER_ASSERT(r, buf[k] == 1.0f);
        }
    });
}
371 
DEF_TEST(SkVM_cmp_i32, r) {
    // Pack the results of six integer comparisons into bits 0..5 of one output word.
    skvm::Builder b;
    {
        skvm::I32 x = b.load32(b.varying<int>());

        // Reduce a comparison mask to a single bit and move it to position `shift`.
        auto bit_at = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 packed = b.splat(0);
        packed = b.bit_or(packed, bit_at(0, b. eq(x, b.splat(0))));
        packed = b.bit_or(packed, bit_at(1, b.neq(x, b.splat(1))));
        packed = b.bit_or(packed, bit_at(2, b. lt(x, b.splat(2))));
        packed = b.bit_or(packed, bit_at(3, b.lte(x, b.splat(3))));
        packed = b.bit_or(packed, bit_at(4, b. gt(x, b.splat(4))));
        packed = b.bit_or(packed, bit_at(5, b.gte(x, b.splat(5))));

        b.store32(b.varying<int>(), packed);
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        p.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0..4 each have their own pattern; from 5 upward the answer is constant.
        const int want_low[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        const int kLow = (int)SK_ARRAY_COUNT(want_low);
        for (int k = 0; k < (int)SK_ARRAY_COUNT(out); k++) {
            const int want = k < kLow ? want_low[k] : 0b110010;
            REPORTER_ASSERT(r, out[k] == want);
        }
    });
}
407 
DEF_TEST(SkVM_cmp_f32, r) {
    // Float counterpart of SkVM_cmp_i32: six comparisons packed into bits 0..5.
    skvm::Builder b;
    {
        skvm::F32 x = b.loadF(b.varying<float>());

        // Reduce a comparison mask to a single bit and move it to position `shift`.
        auto bit_at = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 packed = b.splat(0);
        packed = b.bit_or(packed, bit_at(0, b. eq(x, b.splat(0.0f))));
        packed = b.bit_or(packed, bit_at(1, b.neq(x, b.splat(1.0f))));
        packed = b.bit_or(packed, bit_at(2, b. lt(x, b.splat(2.0f))));
        packed = b.bit_or(packed, bit_at(3, b.lte(x, b.splat(3.0f))));
        packed = b.bit_or(packed, bit_at(4, b. gt(x, b.splat(4.0f))));
        packed = b.bit_or(packed, bit_at(5, b.gte(x, b.splat(5.0f))));

        b.store32(b.varying<int>(), packed);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        float in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        p.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0..4 each have their own pattern; from 5 upward the answer is constant.
        const int want_low[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010 };
        const int kLow = (int)SK_ARRAY_COUNT(want_low);
        for (int k = 0; k < (int)SK_ARRAY_COUNT(out); k++) {
            const int want = k < kLow ? want_low[k] : 0b110010;
            REPORTER_ASSERT(r, out[k] == want);
        }
    });
}
444 
DEF_TEST(SkVM_index, r) {
    // Store b.index() into every element.
    skvm::Builder builder;
    builder.store32(builder.varying<int>(), builder.index());

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int buf[23];
        p.eval(SK_ARRAY_COUNT(buf), buf);

        // The test expects a countdown: N in the first slot, down to 1 in the last.
        int remaining = (int)SK_ARRAY_COUNT(buf);
        for (int v : buf) {
            REPORTER_ASSERT(r, v == remaining);
            remaining--;
        }
    });
}
457 
DEF_TEST(SkVM_mad, r) {
    // A chain of Op::mad_f32 arranged to hit the awkward cases of instruction
    // and register selection.

    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int>();

        skvm::F32 x = builder.to_F32(builder.load32(ptr)),
                  y = builder.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = builder.mad(y,y,x),   // y is needed in the future, but r[z] = r[x] is ok.
                  w = builder.mad(z,z,y),   // w can alias z but not y.
                  v = builder.mad(w,y,w);   // Got to stop somewhere.
        builder.store32(ptr, builder.trunc(v));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // Starting from x = 2:
        //   y = 2*2 + 2       = 6
        //   z = 6*6 + 2       = 38
        //   w = 38*38 + 6     = 1450
        //   v = 1450*6 + 1450 = 10150
        int value = 2;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 10150);
    });
}
485 
DEF_TEST(SkVM_fms, r) {
    // Create a pattern that can be peepholed into an Op::fms_f32:  v = x*2 - 1.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.mul(x, b.splat(2.0f)),
                            b.splat(1.0f));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        // Pass the array itself (decays to int*), as the other tests in this file do.
        program.eval((int)SK_ARRAY_COUNT(buf), buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            // This must compare, not assign.  `buf[i] = 2*i-1` would overwrite the
            // computed value and always be truthy (2*i-1 is odd, so never zero),
            // leaving the program's output completely unchecked.
            REPORTER_ASSERT(r, buf[i] == 2*i-1);
        }
    });
}
507 
DEF_TEST(SkVM_fnma, r) {
    // Create a pattern that can be peepholed into an Op::fnma_f32:  v = 1 - x*2.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.splat(1.0f),
                            b.mul(x, b.splat(2.0f)));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        // Pass the array itself (decays to int*), as the other tests in this file do.
        program.eval((int)SK_ARRAY_COUNT(buf), buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            // This must compare, not assign.  `buf[i] = 1-2*i` would overwrite the
            // computed value and always be truthy (1-2*i is odd, so never zero),
            // leaving the program's output completely unchecked.
            REPORTER_ASSERT(r, buf[i] == 1-2*i);
        }
    });
}
529 
DEF_TEST(SkVM_madder, r) {
    // Another mad chain, this time on floats loaded directly from memory.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<float>();

        skvm::F32 x = builder.loadF(ptr),
                  y = builder.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = builder.mad(y,x,y),   // r[x] can be reused after this instruction, but not r[y].
                  w = builder.mad(y,y,z);
        builder.storeF(ptr, w);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // Starting from x = 2:
        //   y = 2*2 + 2  = 6
        //   z = 6*2 + 6  = 18
        //   w = 6*6 + 18 = 54
        float value = 2.0f;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 54.0f);
    });
}
551 
DEF_TEST(SkVM_floor, r) {
    // buf[i] = floor(buf[i])
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<float>();
        builder.storeF(ptr, builder.floor(builder.loadF(ptr)));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // Whole numbers stay put; halves round toward negative infinity.
        float vals[] = { -2.0f, -1.5f, -1.0f, 0.0f, 1.0f, 1.5f, 2.0f };
        float want[] = { -2.0f, -2.0f, -1.0f, 0.0f, 1.0f, 1.0f, 2.0f };
        p.eval(SK_ARRAY_COUNT(vals), vals);

        const int count = (int)SK_ARRAY_COUNT(vals);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, vals[k] == want[k]);
        }
    });
}
568 
DEF_TEST(SkVM_round, r) {
    // dst[i] = round(src[i]), float in, int out.
    skvm::Builder builder;
    {
        skvm::Ptr in  = builder.varying<float>();
        skvm::Ptr out = builder.varying<int>();
        builder.store32(out, builder.round(builder.loadF(in)));
    }

    // The cases sitting exactly on a 0.5f boundary assume the current rounding mode is
    // nearest even.  Nothing here guarantees that explicitly... it just probably is.
    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float vals[] = { -1.5f, -0.5f, 0.0f, 0.5f, 0.2f, 0.6f, 1.0f, 1.4f, 1.5f, 2.0f };
        int want[] =   { -2   ,  0   , 0   , 0   , 0   , 1   , 1   , 1   , 2   , 2    };
        int got[SK_ARRAY_COUNT(vals)];

        p.eval(SK_ARRAY_COUNT(vals), vals, got);

        const int count = (int)SK_ARRAY_COUNT(got);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, got[k] == want[k]);
        }
    });
}
590 
DEF_TEST(SkVM_min, r) {
    // dst[i] = min(src1[i], src2[i])
    skvm::Builder builder;
    {
        skvm::Ptr lhs = builder.varying<float>();
        skvm::Ptr rhs = builder.varying<float>();
        skvm::Ptr out = builder.varying<float>();

        builder.storeF(out, builder.min(builder.loadF(lhs), builder.loadF(rhs)));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float a[]    = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float b[]    = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        float want[] = { 0.0f, 1.0f, 3.0f, -1.0f, -2.0f};
        float got[SK_ARRAY_COUNT(a)];
        p.eval(SK_ARRAY_COUNT(got), a, b, got);

        const int count = (int)SK_ARRAY_COUNT(got);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, got[k] == want[k]);
        }
    });
}
612 
DEF_TEST(SkVM_max, r) {
    // dst[i] = max(src1[i], src2[i])
    skvm::Builder builder;
    {
        skvm::Ptr lhs = builder.varying<float>();
        skvm::Ptr rhs = builder.varying<float>();
        skvm::Ptr out = builder.varying<float>();

        builder.storeF(out, builder.max(builder.loadF(lhs), builder.loadF(rhs)));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        float a[]    = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float b[]    = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        float want[] = { 0.0f, 2.0f, 4.0f,  1.0f, -1.0f};
        float got[SK_ARRAY_COUNT(a)];
        p.eval(SK_ARRAY_COUNT(got), a, b, got);

        const int count = (int)SK_ARRAY_COUNT(got);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, got[k] == want[k]);
        }
    });
}
634 
DEF_TEST(SkVM_hoist, r) {
    // With this many constants the program will fail to JIT if they are all hoisted.
    // The JIT then retries without hoisting, which needs only 2 registers.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int>();
        skvm::I32 sum = builder.load32(ptr);
        for (int k = 0; k != 32; k++) {
            sum = builder.add(sum, builder.splat(k));
        }
        builder.store32(ptr, sum);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // 0 + 1 + 2 + ... + 31 == 496, added on top of the initial 4.
        int value = 4;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 500);
    });
}
656 
DEF_TEST(SkVM_select, r) {
    // buf[i] = buf[i] > 4 ? buf[i] : 42
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int>();

        skvm::I32 v = builder.load32(ptr);

        v = builder.select( builder.gt(v, builder.splat(4)), v, builder.splat(42) );

        builder.store32(ptr, v);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int vals[] = { 0,1,2,3,4,5,6,7,8 };
        p.eval(SK_ARRAY_COUNT(vals), vals);

        const int count = (int)SK_ARRAY_COUNT(vals);
        for (int k = 0; k < count; k++) {
            // vals started out as vals[k] == k.
            const int want = k > 4 ? k : 42;
            REPORTER_ASSERT(r, vals[k] == want);
        }
    });
}
677 
DEF_TEST(SkVM_swap, r) {
    skvm::Builder builder;
    {
        // The program below is equivalent to
        //     x = *X
        //     y = *Y
        //     *X = y
        //     *Y = x
        // Scheduling purely by the data flow of Op arguments would permit
        //     x = *X
        //     *Y = x
        //     y = *Y
        //     *X = y
        // but that ordering gives different results, so it is not a valid reschedule.
        skvm::Ptr X = builder.varying<int>(),
                  Y = builder.varying<int>();

        skvm::I32 x = builder.load32(X),
                  y = builder.load32(Y);

        builder.store32(X, y);
        builder.store32(Y, x);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        int first [] = { 0,1,2,3 };
        int second[] = { 4,5,6,7 };
        p.eval(SK_ARRAY_COUNT(first), first, second);

        // After the swap, `first` holds 4..7 and `second` holds 0..3.
        const int count = (int)SK_ARRAY_COUNT(first);
        for (int k = 0; k < count; k++) {
            REPORTER_ASSERT(r, first[k] == 4 + k);
            REPORTER_ASSERT(r, second[k] == k);
        }
    });
}
712 
DEF_TEST(SkVM_NewOps, r) {
    // Exercise a somewhat arbitrary mix of ops: 16-bit load/store, uniform32,
    // select-based clamping and gather8.
    skvm::Builder builder;
    {
        skvm::Ptr ptr = builder.varying<int16_t>();
        skvm::UPtr uniforms = builder.uniform();

        skvm::I32 v = builder.load16(ptr);

        // The int uniforms sit right after the image pointer in the uniform struct below.
        const size_t kPtr = sizeof(const int*);

        v = builder.add(v, builder.uniform32(uniforms, kPtr+0));
        v = builder.mul(v, builder.uniform32(uniforms, kPtr+4));
        v = builder.sub(v, builder.uniform32(uniforms, kPtr+8));

        // Clamp to [0, limit] before using the value as a gather index.
        skvm::I32 limit = builder.uniform32(uniforms, kPtr+12);
        v = builder.select(builder.lt(v, builder.splat(0)), builder.splat(0), v);
        v = builder.select(builder.gt(v, limit           ), limit           , v);

        v = builder.gather8(uniforms,0, v);

        builder.store16(ptr, v);
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        const int N = 31;
        int16_t buf[N];
        for (int k = 0; k < N; k++) {
            buf[k] = k;
        }

        const int M = 16;
        uint8_t img[M];
        for (int k = 0; k < M; k++) {
            img[k] = k*k;
        }

        struct {
            const uint8_t* img;
            int      add   = 5;
            int      mul   = 3;
            int      sub   = 18;
            int      limit = M-1;
        } uniforms{img};

        p.eval(N, buf, &uniforms);

        for (int k = 0; k < N; k++) {
            // The arithmetic computes (k+5)*3 - 18, a.k.a. 3*(k-1), which is then
            // pinned to the bounds of img.
            int index;
            if      (k < 2) { index =  0;      }  // k == 1 lands on 0 exactly...
            else if (k > 5) { index = 15;      }  // ...and k == 6 lands on 15 exactly.
            else            { index = 3*(k-1); }
            REPORTER_ASSERT(r, buf[k] == img[index]);
        }
    });
}
771 
DEF_TEST(SKVM_array32, r) {
    skvm::Builder builder;
    skvm::Uniforms uniforms(builder.uniform(), 0);
    // Occupy the first slot so the uniforms pushed below do not sit at offset 0.
    uniforms.push(0);
    int ints[] = {3, 7};
    skvm::Uniform int_array = uniforms.pushArray(ints);
    float floats[] = {5, 9};
    skvm::Uniform float_array = uniforms.pushArrayF(floats);
    {
        skvm::Ptr out0 = builder.varying<int32_t>(),
                  out1 = builder.varying<int32_t>(),
                  out2 = builder.varying<int32_t>();

        // out0 = ints[0], out1 = ints[1]
        skvm::I32 first = builder.array32(int_array, 0);
        builder.store32(out0, first);
        skvm::I32 second = builder.array32(int_array, 1);
        builder.store32(out1, second);

        // out2 = trunc(floats[0] + floats[1])
        skvm::F32 x = builder.arrayF(float_array, 0);
        skvm::F32 y = builder.arrayF(float_array, 1);
        builder.store32(out2, builder.trunc(builder.add(x, y)));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& program) {
        const int K = 10;
        int32_t buf0[K],
                buf1[K],
                buf2[K];

        // Evaluate once and check that each output buffer is filled with one value.
        auto run_and_check = [&](int want0, int want1, int want2) {
            program.eval(K, uniforms.buf.data(), buf0, buf1, buf2);
            for (auto v : buf0) {
                REPORTER_ASSERT(r, v == want0);
            }
            for (auto v : buf1) {
                REPORTER_ASSERT(r, v == want1);
            }
            for (auto v : buf2) {
                REPORTER_ASSERT(r, v == want2);
            }
        };

        // This lambda can run more than once (see test_jit_and_interpreter), so put
        // the array contents back to their starting values first.
        ints[0]   = 3;
        floats[1] = 9;
        run_and_check(3, 7, 14);

        // Edit the backing arrays in place; the test expects the next eval to see the edits.
        ints[0]   = 4;
        floats[1] = 10;
        run_and_check(4, 7, 15);
    });
}
832 
DEF_TEST(SkVM_sqrt, r) {
    // buf[i] = sqrt(buf[i])
    skvm::Builder builder;
    auto ptr = builder.varying<int>();
    builder.storeF(ptr, builder.sqrt(builder.loadF(ptr)));

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        constexpr int K = 17;
        float vals[K];

        // Fill with perfect squares so the roots are exact: x^2 -> x.
        for (int k = 0; k < K; k++) {
            vals[k] = (float)(k*k);
        }

        p.eval(K, vals);

        for (int k = 0; k < K; k++) {
            REPORTER_ASSERT(r, vals[k] == (float)k);
        }
    });
}
853 
DEF_TEST(SkVM_MSAN, r) {
    // This small memset32()-style program should be JIT-able.  In an MSAN build,
    // though, running the JIT code would hide its writes from MSAN, so buf would
    // still look uninitialized.  This test checks that we use the interpreter instead.
    skvm::Builder builder;
    builder.store32(builder.varying<int>(), builder.splat(42));

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        constexpr int K = 17;
        int buf[K];                 // Intentionally uninitialized.
        p.eval(K, buf);
        sk_msan_assert_initialized(buf, buf+K);

        for (int k = 0; k < K; k++) {
            REPORTER_ASSERT(r, buf[k] == 42);
        }
    });
}
871 
DEF_TEST(SkVM_assert, r) {
    // assert_true(x < 42) for every loaded x.
    skvm::Builder builder;
    {
        skvm::I32 loaded = builder.load32(builder.varying<int>());
        builder.assert_true(builder.lt(loaded,
                                       builder.splat(42)));
    }

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // Every input is below 42, so the condition holds for all of them.
        int vals[] = { 0,1,2,3,4,5,6,7,8,9 };
        p.eval(SK_ARRAY_COUNT(vals), vals);
    });
}
882 
DEF_TEST(SkVM_trace_line, r) {
    // A program made of a single trace_line instruction.
    skvm::Builder builder;
    builder.trace_line(builder.splat(0xFFFFFFFF), 123);

    test_jit_and_interpreter(builder, [&](const skvm::Program& p) {
        // trace_line has no behavior yet, so there is nothing to check beyond
        // being able to run it.
        p.eval(1);
    });
}
892 
DEF_TEST(SkVM_premul,reporter)893 DEF_TEST(SkVM_premul, reporter) {
894     // Test that premul is short-circuited when alpha is known opaque.
895     {
896         skvm::Builder p;
897         auto rptr = p.varying<int>(),
898              aptr = p.varying<int>();
899 
900         skvm::F32 r = p.loadF(rptr),
901                   g = p.splat(0.0f),
902                   b = p.splat(0.0f),
903                   a = p.loadF(aptr);
904 
905         p.premul(&r, &g, &b, a);
906         p.storeF(rptr, r);
907 
908         // load red, load alpha, red *= alpha, store red
909         REPORTER_ASSERT(reporter, p.done().instructions().size() == 4);
910     }
911 
912     {
913         skvm::Builder p;
914         auto rptr = p.varying<int>();
915 
916         skvm::F32 r = p.loadF(rptr),
917                   g = p.splat(0.0f),
918                   b = p.splat(0.0f),
919                   a = p.splat(1.0f);
920 
921         p.premul(&r, &g, &b, a);
922         p.storeF(rptr, r);
923 
924         // load red, store red
925         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
926     }
927 
928     // Same deal for unpremul.
929     {
930         skvm::Builder p;
931         auto rptr = p.varying<int>(),
932              aptr = p.varying<int>();
933 
934         skvm::F32 r = p.loadF(rptr),
935                   g = p.splat(0.0f),
936                   b = p.splat(0.0f),
937                   a = p.loadF(aptr);
938 
939         p.unpremul(&r, &g, &b, a);
940         p.storeF(rptr, r);
941 
942         // load red, load alpha, a bunch of unpremul instructions, store red
943         REPORTER_ASSERT(reporter, p.done().instructions().size() >= 4);
944     }
945 
946     {
947         skvm::Builder p;
948         auto rptr = p.varying<int>();
949 
950         skvm::F32 r = p.loadF(rptr),
951                   g = p.splat(0.0f),
952                   b = p.splat(0.0f),
953                   a = p.splat(1.0f);
954 
955         p.unpremul(&r, &g, &b, a);
956         p.storeF(rptr, r);
957 
958         // load red, store red
959         REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
960     }
961 }
962 
963 template <typename Fn>
test_asm(skiatest::Reporter * r,Fn && fn,std::initializer_list<uint8_t> expected)964 static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
965     uint8_t buf[4096];
966     skvm::Assembler a{buf};
967     fn(a);
968 
969     REPORTER_ASSERT(r, a.size() == expected.size());
970 
971     auto got = (const uint8_t*)buf,
972          want = expected.begin();
973     for (int i = 0; i < (int)std::min(a.size(), expected.size()); i++) {
974         REPORTER_ASSERT(r, got[i] == want[i],
975                         "byte %d was %02x, want %02x", i, got[i], want[i]);
976     }
977 }
978 
DEF_TEST(SkVM_Assembler,r)979 DEF_TEST(SkVM_Assembler, r) {
980     // Easiest way to generate test cases is
981     //
982     //   echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
983     //
984     // The -x86-asm-syntax=intel bit is optional, controlling the
985     // input syntax only; the output will always be AT&T  op x,y,dst style.
986     // Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
987     // that a bit easier to use here, despite maybe favoring AT&T overall.
988 
989     using A = skvm::Assembler;
990     // Our exit strategy from AVX code.
991     test_asm(r, [&](A& a) {
992         a.int3();
993         a.vzeroupper();
994         a.ret();
995     },{
996         0xcc,
997         0xc5, 0xf8, 0x77,
998         0xc3,
999     });
1000 
1001     // Align should pad with zero
1002     test_asm(r, [&](A& a) {
1003         a.ret();
1004         a.align(4);
1005     },{
1006         0xc3,
1007         0x00, 0x00, 0x00,
1008     });
1009 
1010     test_asm(r, [&](A& a) {
1011         a.add(A::rax, 8);       // Always good to test rax.
1012         a.sub(A::rax, 32);
1013 
1014         a.add(A::rdi, 12);      // Last 0x48 REX
1015         a.sub(A::rdi, 8);
1016 
1017         a.add(A::r8 , 7);       // First 0x49 REX
1018         a.sub(A::r8 , 4);
1019 
1020         a.add(A::rsi, 128);     // Requires 4 byte immediate.
1021         a.sub(A::r8 , 1000000);
1022 
1023         a.add(A::Mem{A::rsi}, 7);                       // addq $7, (%rsi)
1024         a.add(A::Mem{A::rsi, 12}, 7);                   // addq $7, 12(%rsi)
1025         a.add(A::Mem{A::rsp, 12}, 7);                   // addq $7, 12(%rsp)
1026         a.add(A::Mem{A::r12, 12}, 7);                   // addq $7, 12(%r12)
1027         a.add(A::Mem{A::rsp, 12, A::rax, A::FOUR}, 7);  // addq $7, 12(%rsp,%rax,4)
1028         a.add(A::Mem{A::r12, 12, A::rax, A::FOUR}, 7);  // addq $7, 12(%r12,%rax,4)
1029         a.add(A::Mem{A::rax, 12, A::r12, A::FOUR}, 7);  // addq $7, 12(%rax,%r12,4)
1030         a.add(A::Mem{A::r11, 12, A::r8 , A::TWO }, 7);  // addq $7, 12(%r11,%r8,2)
1031         a.add(A::Mem{A::r11, 12, A::rax}         , 7);  // addq $7, 12(%r11,%rax)
1032         a.add(A::Mem{A::rax, 12, A::r11}         , 7);  // addq $7, 12(%rax,%r11)
1033 
1034         a.sub(A::Mem{A::rax, 12, A::r11}         , 7);  // subq $7, 12(%rax,%r11)
1035 
1036         a.add(       A::rax     , A::rcx);              // addq %rcx, %rax
1037         a.add(A::Mem{A::rax}    , A::rcx);              // addq %rcx, (%rax)
1038         a.add(A::Mem{A::rax, 12}, A::rcx);              // addq %rcx, 12(%rax)
1039         a.add(A::rcx, A::Mem{A::rax, 12});              // addq 12(%rax), %rcx
1040 
1041         a.sub(A::rcx, A::Mem{A::rax, 12});              // subq 12(%rax), %rcx
1042     },{
1043         0x48, 0x83, 0b11'000'000, 0x08,
1044         0x48, 0x83, 0b11'101'000, 0x20,
1045 
1046         0x48, 0x83, 0b11'000'111, 0x0c,
1047         0x48, 0x83, 0b11'101'111, 0x08,
1048 
1049         0x49, 0x83, 0b11'000'000, 0x07,
1050         0x49, 0x83, 0b11'101'000, 0x04,
1051 
1052         0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
1053         0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
1054 
1055         0x48,0x83,0x06,0x07,
1056         0x48,0x83,0x46,0x0c,0x07,
1057         0x48,0x83,0x44,0x24,0x0c,0x07,
1058         0x49,0x83,0x44,0x24,0x0c,0x07,
1059         0x48,0x83,0x44,0x84,0x0c,0x07,
1060         0x49,0x83,0x44,0x84,0x0c,0x07,
1061         0x4a,0x83,0x44,0xa0,0x0c,0x07,
1062         0x4b,0x83,0x44,0x43,0x0c,0x07,
1063         0x49,0x83,0x44,0x03,0x0c,0x07,
1064         0x4a,0x83,0x44,0x18,0x0c,0x07,
1065 
1066         0x4a,0x83,0x6c,0x18,0x0c,0x07,
1067 
1068         0x48,0x01,0xc8,
1069         0x48,0x01,0x08,
1070         0x48,0x01,0x48,0x0c,
1071         0x48,0x03,0x48,0x0c,
1072         0x48,0x2b,0x48,0x0c,
1073     });
1074 
1075 
1076     test_asm(r, [&](A& a) {
1077         a.vpaddd (A::ymm0, A::ymm1, A::ymm2);  // Low registers and 0x0f map     -> 2-byte VEX.
1078         a.vpaddd (A::ymm8, A::ymm1, A::ymm2);  // A high dst register is ok      -> 2-byte VEX.
1079         a.vpaddd (A::ymm0, A::ymm8, A::ymm2);  // A high first argument register -> 2-byte VEX.
1080         a.vpaddd (A::ymm0, A::ymm1, A::ymm8);  // A high second argument         -> 3-byte VEX.
1081         a.vpmulld(A::ymm0, A::ymm1, A::ymm2);  // Using non-0x0f map instruction -> 3-byte VEX.
1082         a.vpsubd (A::ymm0, A::ymm1, A::ymm2);  // Test vpsubd to ensure argument order is right.
1083     },{
1084         /*    VEX     */ /*op*/ /*modRM*/
1085         0xc5,       0xf5, 0xfe, 0xc2,
1086         0xc5,       0x75, 0xfe, 0xc2,
1087         0xc5,       0xbd, 0xfe, 0xc2,
1088         0xc4, 0xc1, 0x75, 0xfe, 0xc0,
1089         0xc4, 0xe2, 0x75, 0x40, 0xc2,
1090         0xc5,       0xf5, 0xfa, 0xc2,
1091     });
1092 
1093     test_asm(r, [&](A& a) {
1094         a.vpaddw   (A::ymm4, A::ymm3, A::ymm2);
1095         a.vpavgw   (A::ymm4, A::ymm3, A::ymm2);
1096         a.vpcmpeqw (A::ymm4, A::ymm3, A::ymm2);
1097         a.vpcmpgtw (A::ymm4, A::ymm3, A::ymm2);
1098 
1099         a.vpminsw  (A::ymm4, A::ymm3, A::ymm2);
1100         a.vpmaxsw  (A::ymm4, A::ymm3, A::ymm2);
1101         a.vpminuw  (A::ymm4, A::ymm3, A::ymm2);
1102         a.vpmaxuw  (A::ymm4, A::ymm3, A::ymm2);
1103 
1104         a.vpmulhrsw(A::ymm4, A::ymm3, A::ymm2);
1105         a.vpabsw   (A::ymm4, A::ymm3);
1106         a.vpsllw   (A::ymm4, A::ymm3, 12);
1107         a.vpsraw   (A::ymm4, A::ymm3, 12);
1108     },{
1109         0xc5,     0xe5, 0xfd, 0xe2,
1110         0xc5,     0xe5, 0xe3, 0xe2,
1111         0xc5,     0xe5, 0x75, 0xe2,
1112         0xc5,     0xe5, 0x65, 0xe2,
1113 
1114         0xc5,     0xe5, 0xea, 0xe2,
1115         0xc5,     0xe5, 0xee, 0xe2,
1116         0xc4,0xe2,0x65, 0x3a, 0xe2,
1117         0xc4,0xe2,0x65, 0x3e, 0xe2,
1118 
1119         0xc4,0xe2,0x65, 0x0b, 0xe2,
1120         0xc4,0xe2,0x7d, 0x1d, 0xe3,
1121         0xc5,0xdd,0x71, 0xf3, 0x0c,
1122         0xc5,0xdd,0x71, 0xe3, 0x0c,
1123     });
1124 
1125     test_asm(r, [&](A& a) {
1126         A::Label l;
1127         a.vcmpeqps (A::ymm0, A::ymm1, &l);      // vcmpeqps 0x1c(%rip), %ymm1, %ymm0
1128         a.vpcmpeqd (A::ymm0, A::ymm1, A::ymm2);
1129         a.vpcmpgtd (A::ymm0, A::ymm1, A::ymm2);
1130         a.vcmpeqps (A::ymm0, A::ymm1, A::ymm2);
1131         a.vcmpltps (A::ymm0, A::ymm1, A::ymm2);
1132         a.vcmpleps (A::ymm0, A::ymm1, A::ymm2);
1133         a.vcmpneqps(A::ymm0, A::ymm1, A::ymm2);
1134         a.label(&l);   // 28 bytes after the vcmpeqps that uses it.
1135     },{
1136         0xc5,0xf4,0xc2,0x05,0x1c,0x00,0x00,0x00,0x00,
1137         0xc5,0xf5,0x76,0xc2,
1138         0xc5,0xf5,0x66,0xc2,
1139         0xc5,0xf4,0xc2,0xc2,0x00,
1140         0xc5,0xf4,0xc2,0xc2,0x01,
1141         0xc5,0xf4,0xc2,0xc2,0x02,
1142         0xc5,0xf4,0xc2,0xc2,0x04,
1143     });
1144 
1145     test_asm(r, [&](A& a) {
1146         a.vminps(A::ymm0, A::ymm1, A::ymm2);
1147         a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
1148     },{
1149         0xc5,0xf4,0x5d,0xc2,
1150         0xc5,0xf4,0x5f,0xc2,
1151     });
1152 
1153     test_asm(r, [&](A& a) {
1154         a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
1155     },{
1156         0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
1157     });
1158 
1159     test_asm(r, [&](A& a) {
1160         a.vpsrld(A::ymm15, A::ymm2, 8);
1161         a.vpsrld(A::ymm0 , A::ymm8, 5);
1162     },{
1163         0xc5,     0x85, 0x72,0xd2, 0x08,
1164         0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
1165     });
1166 
1167     test_asm(r, [&](A& a) {
1168         A::Label l;
1169         a.vpermps(A::ymm1, A::ymm2, A::Mem{A::rdi, 32});
1170         a.vperm2f128(A::ymm1, A::ymm2, &l, 0x20);
1171         a.vpermq(A::ymm1, A::ymm2, 5);
1172         a.label(&l);  // 6 bytes after vperm2f128
1173     },{
1174         0xc4,0xe2,0x6d,0x16,0x4f,0x20,
1175         0xc4,0xe3,0x6d,0x06,0x0d,0x06,0x00,0x00,0x00,0x20,
1176         0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
1177     });
1178 
1179     test_asm(r, [&](A& a) {
1180         a.vpunpckldq(A::ymm1, A::ymm2, A::Mem{A::rdi});
1181         a.vpunpckhdq(A::ymm1, A::ymm2, A::ymm3);
1182     },{
1183         0xc5,0xed,0x62,0x0f,
1184         0xc5,0xed,0x6a,0xcb,
1185     });
1186 
1187     test_asm(r, [&](A& a) {
1188         a.vroundps(A::ymm1, A::ymm2, A::NEAREST);
1189         a.vroundps(A::ymm1, A::ymm2, A::FLOOR);
1190         a.vroundps(A::ymm1, A::ymm2, A::CEIL);
1191         a.vroundps(A::ymm1, A::ymm2, A::TRUNC);
1192     },{
1193         0xc4,0xe3,0x7d,0x08,0xca,0x00,
1194         0xc4,0xe3,0x7d,0x08,0xca,0x01,
1195         0xc4,0xe3,0x7d,0x08,0xca,0x02,
1196         0xc4,0xe3,0x7d,0x08,0xca,0x03,
1197     });
1198 
1199     test_asm(r, [&](A& a) {
1200         A::Label l;
1201         a.label(&l);
1202         a.byte(1);
1203         a.byte(2);
1204         a.byte(3);
1205         a.byte(4);
1206 
1207         a.vbroadcastss(A::ymm0 , &l);
1208         a.vbroadcastss(A::ymm1 , &l);
1209         a.vbroadcastss(A::ymm8 , &l);
1210         a.vbroadcastss(A::ymm15, &l);
1211 
1212         a.vpshufb(A::ymm4, A::ymm3, &l);
1213         a.vpaddd (A::ymm4, A::ymm3, &l);
1214         a.vpsubd (A::ymm4, A::ymm3, &l);
1215 
1216         a.vptest(A::ymm4, &l);
1217 
1218         a.vmulps (A::ymm4, A::ymm3, &l);
1219     },{
1220         0x01, 0x02, 0x03, 0x4,
1221 
1222         /*     VEX    */  /*op*/ /*   ModRM    */  /*     offset     */
1223         0xc4, 0xe2, 0x7d,  0x18,   0b00'000'101,   0xf3,0xff,0xff,0xff,   // 0xfffffff3 == -13
1224         0xc4, 0xe2, 0x7d,  0x18,   0b00'001'101,   0xea,0xff,0xff,0xff,   // 0xffffffea == -22
1225         0xc4, 0x62, 0x7d,  0x18,   0b00'000'101,   0xe1,0xff,0xff,0xff,   // 0xffffffe1 == -31
1226         0xc4, 0x62, 0x7d,  0x18,   0b00'111'101,   0xd8,0xff,0xff,0xff,   // 0xffffffd8 == -40
1227 
1228         0xc4, 0xe2, 0x65,  0x00,   0b00'100'101,   0xcf,0xff,0xff,0xff,   // 0xffffffcf == -49
1229 
1230         0xc5, 0xe5,        0xfe,   0b00'100'101,   0xc7,0xff,0xff,0xff,   // 0xffffffc7 == -57
1231         0xc5, 0xe5,        0xfa,   0b00'100'101,   0xbf,0xff,0xff,0xff,   // 0xffffffbf == -65
1232 
1233         0xc4, 0xe2, 0x7d,  0x17,   0b00'100'101,   0xb6,0xff,0xff,0xff,   // 0xffffffb6 == -74
1234 
1235         0xc5, 0xe4,        0x59,   0b00'100'101,   0xae,0xff,0xff,0xff,   // 0xffffffae == -82
1236     });
1237 
1238     test_asm(r, [&](A& a) {
1239         a.vbroadcastss(A::ymm0,  A::Mem{A::rdi,   0});
1240         a.vbroadcastss(A::ymm13, A::Mem{A::r14,   7});
1241         a.vbroadcastss(A::ymm8,  A::Mem{A::rdx, -12});
1242         a.vbroadcastss(A::ymm8,  A::Mem{A::rdx, 400});
1243 
1244         a.vbroadcastss(A::ymm8,  A::xmm0);
1245         a.vbroadcastss(A::ymm0,  A::xmm13);
1246     },{
1247         /*   VEX    */ /*op*/     /*ModRM*/   /*offset*/
1248         0xc4,0xe2,0x7d, 0x18,   0b00'000'111,
1249         0xc4,0x42,0x7d, 0x18,   0b01'101'110,  0x07,
1250         0xc4,0x62,0x7d, 0x18,   0b01'000'010,  0xf4,
1251         0xc4,0x62,0x7d, 0x18,   0b10'000'010,  0x90,0x01,0x00,0x00,
1252 
1253         0xc4,0x62,0x7d, 0x18,   0b11'000'000,
1254         0xc4,0xc2,0x7d, 0x18,   0b11'000'101,
1255     });
1256 
1257     test_asm(r, [&](A& a) {
1258         A::Label l;
1259         a.label(&l);
1260         a.jne(&l);
1261         a.jne(&l);
1262         a.je (&l);
1263         a.jmp(&l);
1264         a.jl (&l);
1265         a.jc (&l);
1266 
1267         a.cmp(A::rdx, 1);
1268         a.cmp(A::rax, 12);
1269         a.cmp(A::r14, 2000000000);
1270     },{
1271         0x0f,0x85, 0xfa,0xff,0xff,0xff,   // near jne -6 bytes
1272         0x0f,0x85, 0xf4,0xff,0xff,0xff,   // near jne -12 bytes
1273         0x0f,0x84, 0xee,0xff,0xff,0xff,   // near je  -18 bytes
1274         0xe9,      0xe9,0xff,0xff,0xff,   // near jmp -23 bytes
1275         0x0f,0x8c, 0xe3,0xff,0xff,0xff,   // near jl  -29 bytes
1276         0x0f,0x82, 0xdd,0xff,0xff,0xff,   // near jc  -35 bytes
1277 
1278         0x48,0x83,0xfa,0x01,
1279         0x48,0x83,0xf8,0x0c,
1280         0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
1281     });
1282 
1283     test_asm(r, [&](A& a) {
1284         a.vmovups(A::ymm5, A::Mem{A::rsi});
1285         a.vmovups(A::Mem{A::rsi}, A::ymm5);
1286 
1287         a.vmovups(A::xmm5, A::Mem{A::rsi});
1288         a.vmovups(A::Mem{A::rsi}, A::xmm5);
1289 
1290         a.vpmovzxwd(A::ymm4, A::Mem{A::rsi});
1291         a.vpmovzxbd(A::ymm4, A::Mem{A::rsi});
1292 
1293         a.vmovq(A::Mem{A::rdx}, A::xmm15);
1294     },{
1295         /*    VEX    */  /*Op*/  /*  ModRM  */
1296         0xc5,     0xfc,   0x10,  0b00'101'110,
1297         0xc5,     0xfc,   0x11,  0b00'101'110,
1298 
1299         0xc5,     0xf8,   0x10,  0b00'101'110,
1300         0xc5,     0xf8,   0x11,  0b00'101'110,
1301 
1302         0xc4,0xe2,0x7d,   0x33,  0b00'100'110,
1303         0xc4,0xe2,0x7d,   0x31,  0b00'100'110,
1304 
1305         0xc5,     0x79,   0xd6,  0b00'111'010,
1306     });
1307 
1308     test_asm(r, [&](A& a) {
1309         a.vmovups(A::ymm5, A::Mem{A::rsp,  0});
1310         a.vmovups(A::ymm5, A::Mem{A::rsp, 64});
1311         a.vmovups(A::ymm5, A::Mem{A::rsp,128});
1312 
1313         a.vmovups(A::Mem{A::rsp,  0}, A::ymm5);
1314         a.vmovups(A::Mem{A::rsp, 64}, A::ymm5);
1315         a.vmovups(A::Mem{A::rsp,128}, A::ymm5);
1316     },{
1317         0xc5,0xfc,0x10,0x2c,0x24,
1318         0xc5,0xfc,0x10,0x6c,0x24,0x40,
1319         0xc5,0xfc,0x10,0xac,0x24,0x80,0x00,0x00,0x00,
1320 
1321         0xc5,0xfc,0x11,0x2c,0x24,
1322         0xc5,0xfc,0x11,0x6c,0x24,0x40,
1323         0xc5,0xfc,0x11,0xac,0x24,0x80,0x00,0x00,0x00,
1324     });
1325 
1326     test_asm(r, [&](A& a) {
1327         a.movzbq(A::rax, A::Mem{A::rsi});   // Low registers for src and dst.
1328         a.movzbq(A::rax, A::Mem{A::r8,});   // High src register.
1329         a.movzbq(A::r8 , A::Mem{A::rsi});   // High dst register.
1330         a.movzbq(A::r8,  A::Mem{A::rsi, 12});
1331         a.movzbq(A::r8,  A::Mem{A::rsi, 400});
1332 
1333         a.movzwq(A::rax, A::Mem{A::rsi});   // Low registers for src and dst.
1334         a.movzwq(A::rax, A::Mem{A::r8,});   // High src register.
1335         a.movzwq(A::r8 , A::Mem{A::rsi});   // High dst register.
1336         a.movzwq(A::r8,  A::Mem{A::rsi, 12});
1337         a.movzwq(A::r8,  A::Mem{A::rsi, 400});
1338 
1339         a.vmovd(A::Mem{A::rax}, A::xmm0);
1340         a.vmovd(A::Mem{A::rax}, A::xmm8);
1341         a.vmovd(A::Mem{A::r8 }, A::xmm0);
1342 
1343         a.vmovd(A::xmm0, A::Mem{A::rax});
1344         a.vmovd(A::xmm8, A::Mem{A::rax});
1345         a.vmovd(A::xmm0, A::Mem{A::r8 });
1346 
1347         a.vmovd(A::xmm0 , A::Mem{A::rax, 0, A::rcx, A::FOUR});
1348         a.vmovd(A::xmm15, A::Mem{A::rax, 0, A::r8,  A::TWO });
1349         a.vmovd(A::xmm0 , A::Mem{A::r8 , 0, A::rcx});
1350 
1351         a.vmovd(A::rax, A::xmm0);
1352         a.vmovd(A::rax, A::xmm8);
1353         a.vmovd(A::r8 ,  A::xmm0);
1354 
1355         a.vmovd(A::xmm0, A::rax);
1356         a.vmovd(A::xmm8, A::rax);
1357         a.vmovd(A::xmm0, A::r8 );
1358 
1359         a.movb(A::Mem{A::rdx}, A::rax);
1360         a.movb(A::Mem{A::rdx}, A::r8 );
1361         a.movb(A::Mem{A::r8 }, A::rax);
1362 
1363         a.movb(A::rdx, A::Mem{A::rax});
1364         a.movb(A::rdx, A::Mem{A::r8 });
1365         a.movb(A::r8 , A::Mem{A::rax});
1366 
1367         a.movb(A::rdx, 12);
1368         a.movb(A::rax,  4);
1369         a.movb(A::r8 , -1);
1370 
1371         a.movb(A::Mem{A::rdx}, 12);
1372         a.movb(A::Mem{A::rax},  4);
1373         a.movb(A::Mem{A::r8 }, -1);
1374     },{
1375         0x48,0x0f,0xb6,0x06,     // movzbq (%rsi), %rax
1376         0x49,0x0f,0xb6,0x00,
1377         0x4c,0x0f,0xb6,0x06,
1378         0x4c,0x0f,0xb6,0x46, 12,
1379         0x4c,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,
1380 
1381         0x48,0x0f,0xb7,0x06,    // movzwq (%rsi), %rax
1382         0x49,0x0f,0xb7,0x00,
1383         0x4c,0x0f,0xb7,0x06,
1384         0x4c,0x0f,0xb7,0x46, 12,
1385         0x4c,0x0f,0xb7,0x86, 0x90,0x01,0x00,0x00,
1386 
1387         0xc5,0xf9,0x7e,0x00,
1388         0xc5,0x79,0x7e,0x00,
1389         0xc4,0xc1,0x79,0x7e,0x00,
1390 
1391         0xc5,0xf9,0x6e,0x00,
1392         0xc5,0x79,0x6e,0x00,
1393         0xc4,0xc1,0x79,0x6e,0x00,
1394 
1395         0xc5,0xf9,0x6e,0x04,0x88,
1396         0xc4,0x21,0x79,0x6e,0x3c,0x40,
1397         0xc4,0xc1,0x79,0x6e,0x04,0x08,
1398 
1399         0xc5,0xf9,0x7e,0xc0,
1400         0xc5,0x79,0x7e,0xc0,
1401         0xc4,0xc1,0x79,0x7e,0xc0,
1402 
1403         0xc5,0xf9,0x6e,0xc0,
1404         0xc5,0x79,0x6e,0xc0,
1405         0xc4,0xc1,0x79,0x6e,0xc0,
1406 
1407         0x48 ,0x88, 0x02,
1408         0x4c, 0x88, 0x02,
1409         0x49, 0x88, 0x00,
1410 
1411         0x48 ,0x8a, 0x10,
1412         0x49, 0x8a, 0x10,
1413         0x4c, 0x8a, 0x00,
1414 
1415         0x48, 0xc6, 0xc2, 0x0c,
1416         0x48, 0xc6, 0xc0, 0x04,
1417         0x49, 0xc6, 0xc0, 0xff,
1418 
1419         0x48, 0xc6, 0x02, 0x0c,
1420         0x48, 0xc6, 0x00, 0x04,
1421         0x49, 0xc6, 0x00, 0xff,
1422     });
1423 
1424     test_asm(r, [&](A& a) {
1425         a.vpinsrd(A::xmm1, A::xmm8, A::Mem{A::rsi}, 1);   // vpinsrd $1, (%rsi), %xmm8, %xmm1
1426         a.vpinsrd(A::xmm8, A::xmm1, A::Mem{A::r8 }, 3);   // vpinsrd $3, (%r8), %xmm1, %xmm8;
1427 
1428         a.vpinsrw(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4);   // vpinsrw $4, (%rsi), %xmm8, %xmm1
1429         a.vpinsrw(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinsrw $12, (%r8), %xmm1, %xmm8
1430 
1431         a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4);   // vpinsrb $4, (%rsi), %xmm8, %xmm1
1432         a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinsrb $12, (%r8), %xmm1, %xmm8
1433 
1434         a.vextracti128(A::xmm1, A::ymm8, 1);  // vextracti128 $1, %ymm8, %xmm1
1435         a.vextracti128(A::xmm8, A::ymm1, 0);  // vextracti128 $0, %ymm1, %xmm8
1436 
1437         a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3);  // vpextrd  $3, %xmm8, (%rsi)
1438         a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2);  // vpextrd  $2, %xmm1, (%r8)
1439 
1440         a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
1441         a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
1442 
1443         a.vpextrb(A::Mem{A::rsi}, A::xmm8, 7);
1444         a.vpextrb(A::Mem{A::r8 }, A::xmm1, 15);
1445     },{
1446         0xc4,0xe3,0x39, 0x22, 0x0e, 1,
1447         0xc4,0x43,0x71, 0x22, 0x00, 3,
1448 
1449         0xc5,0xb9,      0xc4, 0x0e,  4,
1450         0xc4,0x41,0x71, 0xc4, 0x00, 12,
1451 
1452         0xc4,0xe3,0x39, 0x20, 0x0e,  4,
1453         0xc4,0x43,0x71, 0x20, 0x00, 12,
1454 
1455         0xc4,0x63,0x7d,0x39,0xc1, 1,
1456         0xc4,0xc3,0x7d,0x39,0xc8, 0,
1457 
1458         0xc4,0x63,0x79,0x16,0x06, 3,
1459         0xc4,0xc3,0x79,0x16,0x08, 2,
1460 
1461         0xc4,0x63,0x79, 0x15, 0x06,  7,
1462         0xc4,0xc3,0x79, 0x15, 0x08, 15,
1463 
1464         0xc4,0x63,0x79, 0x14, 0x06,  7,
1465         0xc4,0xc3,0x79, 0x14, 0x08, 15,
1466     });
1467 
1468     test_asm(r, [&](A& a) {
1469         a.vpandn(A::ymm3, A::ymm12, A::ymm2);
1470     },{
1471         0xc5, 0x9d, 0xdf, 0xda,
1472     });
1473 
1474     test_asm(r, [&](A& a) {
1475         A::Label l;
1476         a.vmovdqa(A::ymm3, A::ymm2);                                // vmovdqa %ymm2         , %ymm3
1477 
1478         a.vmovdqa(A::ymm3, A::Mem{A::rsi});                         // vmovdqa  (%rsi)       , %ymm3
1479         a.vmovdqa(A::ymm3, A::Mem{A::rsp});                         // vmovdqa  (%rsp)       , %ymm3
1480         a.vmovdqa(A::ymm3, A::Mem{A::r11});                         // vmovdqa  (%r11)       , %ymm3
1481 
1482         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4});                     // vmovdqa 4(%rsi)       , %ymm3
1483         a.vmovdqa(A::ymm3, A::Mem{A::rsp,  4});                     // vmovdqa 4(%rsp)       , %ymm3
1484 
1485         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::rax, A::EIGHT});   // vmovdqa 4(%rsi,%rax,8), %ymm3
1486         a.vmovdqa(A::ymm3, A::Mem{A::r11,  4, A::rax, A::TWO  });   // vmovdqa 4(%r11,%rax,2), %ymm3
1487         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11, A::FOUR });   // vmovdqa 4(%rsi,%r11,4), %ymm3
1488         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11, A::ONE  });   // vmovdqa 4(%rsi,%r11,1), %ymm3
1489         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  4, A::r11});             // vmovdqa 4(%rsi,%r11)  , %ymm3
1490 
1491         a.vmovdqa(A::ymm3, A::Mem{A::rsi,  64, A::r11});            // vmovdqa  64(%rsi,%r11), %ymm3
1492         a.vmovdqa(A::ymm3, A::Mem{A::rsi, 128, A::r11});            // vmovdqa 128(%rsi,%r11), %ymm3
1493         a.vmovdqa(A::ymm3, &l);                                     // vmovdqa  16(%rip)     , %ymm3
1494 
1495         a.vcvttps2dq(A::ymm3, A::ymm2);
1496         a.vcvtdq2ps (A::ymm3, A::ymm2);
1497         a.vcvtps2dq (A::ymm3, A::ymm2);
1498         a.vsqrtps   (A::ymm3, A::ymm2);
1499         a.label(&l);
1500     },{
1501         0xc5,0xfd,0x6f,0xda,
1502 
1503         0xc5,0xfd,0x6f,0x1e,
1504         0xc5,0xfd,0x6f,0x1c,0x24,
1505         0xc4,0xc1,0x7d,0x6f,0x1b,
1506 
1507         0xc5,0xfd,0x6f,0x5e,0x04,
1508         0xc5,0xfd,0x6f,0x5c,0x24,0x04,
1509 
1510         0xc5,0xfd,0x6f,0x5c,0xc6,0x04,
1511         0xc4,0xc1,0x7d,0x6f,0x5c,0x43,0x04,
1512         0xc4,0xa1,0x7d,0x6f,0x5c,0x9e,0x04,
1513         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1514         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1515 
1516         0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x40,
1517         0xc4,0xa1,0x7d,0x6f,0x9c,0x1e,0x80,0x00,0x00,0x00,
1518 
1519         0xc5,0xfd,0x6f,0x1d,0x10,0x00,0x00,0x00,
1520 
1521         0xc5,0xfe,0x5b,0xda,
1522         0xc5,0xfc,0x5b,0xda,
1523         0xc5,0xfd,0x5b,0xda,
1524         0xc5,0xfc,0x51,0xda,
1525     });
1526 
1527     test_asm(r, [&](A& a) {
1528         a.vcvtps2ph(A::xmm3, A::ymm2, A::CURRENT);
1529         a.vcvtps2ph(A::Mem{A::rsi, 32, A::rax, A::EIGHT}, A::ymm5, A::CEIL);
1530 
1531         a.vcvtph2ps(A::ymm15, A::Mem{A::rdi, 12, A::r9, A::ONE});
1532         a.vcvtph2ps(A::ymm2, A::xmm3);
1533     },{
1534         0xc4,0xe3,0x7d,0x1d,0xd3,0x04,
1535         0xc4,0xe3,0x7d,0x1d,0x6c,0xc6,0x20,0x02,
1536 
1537         0xc4,0x22,0x7d,0x13,0x7c,0x0f,0x0c,
1538         0xc4,0xe2,0x7d,0x13,0xd3,
1539     });
1540 
1541     test_asm(r, [&](A& a) {
1542         a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
1543         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1544         a.vgatherdps(A::ymm10, A::ONE  , A::ymm2 , A::rax, A::ymm1 );
1545         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm12, A::rax, A::ymm1 );
1546         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::r9 , A::ymm1 );
1547         a.vgatherdps(A::ymm0 , A::ONE  , A::ymm2 , A::rax, A::ymm12);
1548         a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
1549     },{
1550         0xc4,0xe2,0x6d,0x92,0x0c,0x87,
1551         0xc4,0xe2,0x75,0x92,0x04,0x10,
1552         0xc4,0x62,0x75,0x92,0x14,0x10,
1553         0xc4,0xa2,0x75,0x92,0x04,0x20,
1554         0xc4,0xc2,0x75,0x92,0x04,0x11,
1555         0xc4,0xe2,0x1d,0x92,0x04,0x10,
1556         0xc4,0xe2,0x1d,0x92,0x04,0xd0,
1557     });
1558 
1559     test_asm(r, [&](A& a) {
1560         a.mov(A::rax, A::Mem{A::rdi,   0});
1561         a.mov(A::rax, A::Mem{A::rdi,   1});
1562         a.mov(A::rax, A::Mem{A::rdi, 512});
1563         a.mov(A::r15, A::Mem{A::r13,  42});
1564         a.mov(A::rax, A::Mem{A::r13,  42});
1565         a.mov(A::r15, A::Mem{A::rax,  42});
1566         a.mov(A::rax, 1);
1567         a.mov(A::rax, A::rcx);
1568     },{
1569         0x48, 0x8b, 0x07,
1570         0x48, 0x8b, 0x47, 0x01,
1571         0x48, 0x8b, 0x87, 0x00,0x02,0x00,0x00,
1572         0x4d, 0x8b, 0x7d, 0x2a,
1573         0x49, 0x8b, 0x45, 0x2a,
1574         0x4c, 0x8b, 0x78, 0x2a,
1575         0x48, 0xc7, 0xc0, 0x01,0x00,0x00,0x00,
1576         0x48, 0x89, 0xc8,
1577     });
1578 
1579     // echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
1580 
1581     test_asm(r, [&](A& a) {
1582         a.and16b(A::v4, A::v3, A::v1);
1583         a.orr16b(A::v4, A::v3, A::v1);
1584         a.eor16b(A::v4, A::v3, A::v1);
1585         a.bic16b(A::v4, A::v3, A::v1);
1586         a.bsl16b(A::v4, A::v3, A::v1);
1587         a.not16b(A::v4, A::v3);
1588 
1589         a.add4s(A::v4, A::v3, A::v1);
1590         a.sub4s(A::v4, A::v3, A::v1);
1591         a.mul4s(A::v4, A::v3, A::v1);
1592 
1593         a.cmeq4s(A::v4, A::v3, A::v1);
1594         a.cmgt4s(A::v4, A::v3, A::v1);
1595 
1596         a.sub8h(A::v4, A::v3, A::v1);
1597         a.mul8h(A::v4, A::v3, A::v1);
1598 
1599         a.fadd4s(A::v4, A::v3, A::v1);
1600         a.fsub4s(A::v4, A::v3, A::v1);
1601         a.fmul4s(A::v4, A::v3, A::v1);
1602         a.fdiv4s(A::v4, A::v3, A::v1);
1603         a.fmin4s(A::v4, A::v3, A::v1);
1604         a.fmax4s(A::v4, A::v3, A::v1);
1605 
1606         a.fneg4s (A::v4, A::v3);
1607         a.fsqrt4s(A::v4, A::v3);
1608 
1609         a.fmla4s(A::v4, A::v3, A::v1);
1610         a.fmls4s(A::v4, A::v3, A::v1);
1611 
1612         a.fcmeq4s(A::v4, A::v3, A::v1);
1613         a.fcmgt4s(A::v4, A::v3, A::v1);
1614         a.fcmge4s(A::v4, A::v3, A::v1);
1615     },{
1616         0x64,0x1c,0x21,0x4e,
1617         0x64,0x1c,0xa1,0x4e,
1618         0x64,0x1c,0x21,0x6e,
1619         0x64,0x1c,0x61,0x4e,
1620         0x64,0x1c,0x61,0x6e,
1621         0x64,0x58,0x20,0x6e,
1622 
1623         0x64,0x84,0xa1,0x4e,
1624         0x64,0x84,0xa1,0x6e,
1625         0x64,0x9c,0xa1,0x4e,
1626 
1627         0x64,0x8c,0xa1,0x6e,
1628         0x64,0x34,0xa1,0x4e,
1629 
1630         0x64,0x84,0x61,0x6e,
1631         0x64,0x9c,0x61,0x4e,
1632 
1633         0x64,0xd4,0x21,0x4e,
1634         0x64,0xd4,0xa1,0x4e,
1635         0x64,0xdc,0x21,0x6e,
1636         0x64,0xfc,0x21,0x6e,
1637         0x64,0xf4,0xa1,0x4e,
1638         0x64,0xf4,0x21,0x4e,
1639 
1640         0x64,0xf8,0xa0,0x6e,
1641         0x64,0xf8,0xa1,0x6e,
1642 
1643         0x64,0xcc,0x21,0x4e,
1644         0x64,0xcc,0xa1,0x4e,
1645 
1646         0x64,0xe4,0x21,0x4e,
1647         0x64,0xe4,0xa1,0x6e,
1648         0x64,0xe4,0x21,0x6e,
1649     });
1650 
1651     test_asm(r, [&](A& a) {
1652         a.shl4s(A::v4, A::v3,  0);
1653         a.shl4s(A::v4, A::v3,  1);
1654         a.shl4s(A::v4, A::v3,  8);
1655         a.shl4s(A::v4, A::v3, 16);
1656         a.shl4s(A::v4, A::v3, 31);
1657 
1658         a.sshr4s(A::v4, A::v3,  1);
1659         a.sshr4s(A::v4, A::v3,  8);
1660         a.sshr4s(A::v4, A::v3, 31);
1661 
1662         a.ushr4s(A::v4, A::v3,  1);
1663         a.ushr4s(A::v4, A::v3,  8);
1664         a.ushr4s(A::v4, A::v3, 31);
1665 
1666         a.ushr8h(A::v4, A::v3,  1);
1667         a.ushr8h(A::v4, A::v3,  8);
1668         a.ushr8h(A::v4, A::v3, 15);
1669     },{
1670         0x64,0x54,0x20,0x4f,
1671         0x64,0x54,0x21,0x4f,
1672         0x64,0x54,0x28,0x4f,
1673         0x64,0x54,0x30,0x4f,
1674         0x64,0x54,0x3f,0x4f,
1675 
1676         0x64,0x04,0x3f,0x4f,
1677         0x64,0x04,0x38,0x4f,
1678         0x64,0x04,0x21,0x4f,
1679 
1680         0x64,0x04,0x3f,0x6f,
1681         0x64,0x04,0x38,0x6f,
1682         0x64,0x04,0x21,0x6f,
1683 
1684         0x64,0x04,0x1f,0x6f,
1685         0x64,0x04,0x18,0x6f,
1686         0x64,0x04,0x11,0x6f,
1687     });
1688 
1689     test_asm(r, [&](A& a) {
1690         a.sli4s(A::v4, A::v3,  0);
1691         a.sli4s(A::v4, A::v3,  1);
1692         a.sli4s(A::v4, A::v3,  8);
1693         a.sli4s(A::v4, A::v3, 16);
1694         a.sli4s(A::v4, A::v3, 31);
1695     },{
1696         0x64,0x54,0x20,0x6f,
1697         0x64,0x54,0x21,0x6f,
1698         0x64,0x54,0x28,0x6f,
1699         0x64,0x54,0x30,0x6f,
1700         0x64,0x54,0x3f,0x6f,
1701     });
1702 
1703     test_asm(r, [&](A& a) {
1704         a.scvtf4s (A::v4, A::v3);
1705         a.fcvtzs4s(A::v4, A::v3);
1706         a.fcvtns4s(A::v4, A::v3);
1707         a.frintp4s(A::v4, A::v3);
1708         a.frintm4s(A::v4, A::v3);
1709         a.fcvtn   (A::v4, A::v3);
1710         a.fcvtl   (A::v4, A::v3);
1711     },{
1712         0x64,0xd8,0x21,0x4e,
1713         0x64,0xb8,0xa1,0x4e,
1714         0x64,0xa8,0x21,0x4e,
1715         0x64,0x88,0xa1,0x4e,
1716         0x64,0x98,0x21,0x4e,
1717         0x64,0x68,0x21,0x0e,
1718         0x64,0x78,0x21,0x0e,
1719     });
1720 
1721     test_asm(r, [&](A& a) {
1722         a.sub (A::sp, A::sp, 32);  // sub   sp, sp, #32
1723         a.strq(A::v0, A::sp, 1);   // str   q0, [sp, #16]
1724         a.strq(A::v1, A::sp);      // str   q1, [sp]
1725         a.strd(A::v0, A::sp, 6);   // str   d0, [sp, #48]
1726         a.strs(A::v0, A::sp, 6);   // str   s0, [sp, #24]
1727         a.strh(A::v0, A::sp, 10);  // str   h0, [sp, #20]
1728         a.strb(A::v0, A::sp, 47);  // str   b0, [sp, #47]
1729         a.ldrb(A::v9, A::sp, 42);  // ldr   b9, [sp, #42]
1730         a.ldrh(A::v9, A::sp, 47);  // ldr   h9, [sp, #94]
1731         a.ldrs(A::v7, A::sp, 10);  // ldr   s7, [sp, #40]
1732         a.ldrd(A::v7, A::sp,  1);  // ldr   d7, [sp, #8]
1733         a.ldrq(A::v5, A::sp, 128); // ldr   q5, [sp, #2048]
1734         a.add (A::sp, A::sp, 32);  // add   sp, sp, #32
1735     },{
1736          0xff,0x83,0x00,0xd1,
1737          0xe0,0x07,0x80,0x3d,
1738          0xe1,0x03,0x80,0x3d,
1739          0xe0,0x1b,0x00,0xfd,
1740          0xe0,0x1b,0x00,0xbd,
1741          0xe0,0x2b,0x00,0x7d,
1742          0xe0,0xbf,0x00,0x3d,
1743          0xe9,0xab,0x40,0x3d,
1744          0xe9,0xbf,0x40,0x7d,
1745          0xe7,0x2b,0x40,0xbd,
1746          0xe7,0x07,0x40,0xfd,
1747          0xe5,0x03,0xc2,0x3d,
1748          0xff,0x83,0x00,0x91,
1749     });
1750 
1751     test_asm(r, [&](A& a) {
1752         a.brk(0);
1753         a.brk(65535);
1754 
1755         a.ret(A::x30);   // Conventional ret using link register.
1756         a.ret(A::x13);   // Can really return using any register if we like.
1757 
1758         a.add(A::x2, A::x2,  4);
1759         a.add(A::x3, A::x2, 32);
1760 
1761         a.sub(A::x2, A::x2, 4);
1762         a.sub(A::x3, A::x2, 32);
1763 
1764         a.subs(A::x2, A::x2,  4);
1765         a.subs(A::x3, A::x2, 32);
1766 
1767         a.subs(A::xzr, A::x2, 4);  // These are actually the same instruction!
1768         a.cmp(A::x2, 4);
1769 
1770         A::Label l;
1771         a.label(&l);
1772         a.bne(&l);
1773         a.bne(&l);
1774         a.blt(&l);
1775         a.b(&l);
1776         a.cbnz(A::x2, &l);
1777         a.cbz(A::x2, &l);
1778 
1779         a.add(A::x3, A::x2, A::x1);             // add x3,x2,x1
1780         a.add(A::x3, A::x2, A::x1, A::ASR, 3);  // add x3,x2,x1, asr #3
1781     },{
1782         0x00,0x00,0x20,0xd4,
1783         0xe0,0xff,0x3f,0xd4,
1784 
1785         0xc0,0x03,0x5f,0xd6,
1786         0xa0,0x01,0x5f,0xd6,
1787 
1788         0x42,0x10,0x00,0x91,
1789         0x43,0x80,0x00,0x91,
1790 
1791         0x42,0x10,0x00,0xd1,
1792         0x43,0x80,0x00,0xd1,
1793 
1794         0x42,0x10,0x00,0xf1,
1795         0x43,0x80,0x00,0xf1,
1796 
1797         0x5f,0x10,0x00,0xf1,
1798         0x5f,0x10,0x00,0xf1,
1799 
1800         0x01,0x00,0x00,0x54,   // b.ne #0
1801         0xe1,0xff,0xff,0x54,   // b.ne #-4
1802         0xcb,0xff,0xff,0x54,   // b.lt #-8
1803         0xae,0xff,0xff,0x54,   // b.al #-12
1804         0x82,0xff,0xff,0xb5,   // cbnz x2, #-16
1805         0x62,0xff,0xff,0xb4,   // cbz x2, #-20
1806 
1807         0x43,0x00,0x01,0x8b,
1808         0x43,0x0c,0x81,0x8b,
1809     });
1810 
1811     // Can we cbz() to a not-yet-defined label?
1812     test_asm(r, [&](A& a) {
1813         A::Label l;
1814         a.cbz(A::x2, &l);
1815         a.add(A::x3, A::x2, 32);
1816         a.label(&l);
1817         a.ret(A::x30);
1818     },{
1819         0x42,0x00,0x00,0xb4,  // cbz x2, #8
1820         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1821         0xc0,0x03,0x5f,0xd6,  // ret
1822     });
1823 
1824     // If we start a label as a backward label,
1825     // can we redefine it to be a future label?
1826     // (Not sure this is useful... just want to test it works.)
1827     test_asm(r, [&](A& a) {
1828         A::Label l1;
1829         a.label(&l1);
1830         a.add(A::x3, A::x2, 32);
1831         a.cbz(A::x2, &l1);          // This will jump backward... nothing sneaky.
1832 
1833         A::Label l2;                // Start off the same...
1834         a.label(&l2);
1835         a.add(A::x3, A::x2, 32);
1836         a.cbz(A::x2, &l2);          // Looks like this will go backward...
1837         a.add(A::x2, A::x2, 4);
1838         a.add(A::x3, A::x2, 32);
1839         a.label(&l2);               // But no... actually forward!  What a switcheroo!
1840     },{
1841         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1842         0xe2,0xff,0xff,0xb4,  // cbz x2, #-4
1843 
1844         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1845         0x62,0x00,0x00,0xb4,  // cbz x2, #12
1846         0x42,0x10,0x00,0x91,  // add x2, x2, #4
1847         0x43,0x80,0x00,0x91,  // add x3, x2, #32
1848     });
1849 
1850     // Loading from a label on ARM.
1851     test_asm(r, [&](A& a) {
1852         A::Label fore,aft;
1853         a.label(&fore);
1854         a.word(0x01234567);
1855         a.ldrq(A::v1, &fore);
1856         a.ldrq(A::v2, &aft);
1857         a.label(&aft);
1858         a.word(0x76543210);
1859     },{
1860         0x67,0x45,0x23,0x01,
1861         0xe1,0xff,0xff,0x9c,  // ldr q1, #-4
1862         0x22,0x00,0x00,0x9c,  // ldr q2, #4
1863         0x10,0x32,0x54,0x76,
1864     });
1865 
1866     test_asm(r, [&](A& a) {
1867         a.ldrq(A::v0, A::x8);
1868         a.strq(A::v0, A::x8);
1869     },{
1870         0x00,0x01,0xc0,0x3d,
1871         0x00,0x01,0x80,0x3d,
1872     });
1873 
1874     test_asm(r, [&](A& a) {
1875         a.dup4s  (A::v0, A::x8);
1876         a.ld1r4s (A::v0, A::x8);  // echo 'ld1r.4s {v0}, [x8]' | llvm-mc --show-encoding
1877         a.ld1r8h (A::v0, A::x8);
1878         a.ld1r16b(A::v0, A::x8);
1879     },{
1880         0x00,0x0d,0x04,0x4e,
1881         0x00,0xc9,0x40,0x4d,
1882         0x00,0xc5,0x40,0x4d,
1883         0x00,0xc1,0x40,0x4d,
1884     });
1885 
1886     test_asm(r, [&](A& a) {
1887         a.ld24s(A::v0, A::x8);  // echo 'ld2.4s {v0,v1}, [x8]' | llvm-mc --show-encoding
1888         a.ld44s(A::v0, A::x8);
1889         a.st24s(A::v0, A::x8);
1890         a.st44s(A::v0, A::x8);  // echo 'st4.4s {v0,v1,v2,v3}, [x8]' | llvm-mc --show-encoding
1891 
1892         a.ld24s(A::v0, A::x8, 0);  //echo 'ld2 {v0.s,v1.s}[0], [x8]' | llvm-mc --show-encoding
1893         a.ld24s(A::v0, A::x8, 1);
1894         a.ld24s(A::v0, A::x8, 2);
1895         a.ld24s(A::v0, A::x8, 3);
1896 
1897         a.ld44s(A::v0, A::x8, 0);  // ld4 {v0.s,v1.s,v2.s,v3.s}[0], [x8]
1898         a.ld44s(A::v0, A::x8, 1);
1899         a.ld44s(A::v0, A::x8, 2);
1900         a.ld44s(A::v0, A::x8, 3);
1901     },{
1902         0x00,0x89,0x40,0x4c,
1903         0x00,0x09,0x40,0x4c,
1904         0x00,0x89,0x00,0x4c,
1905         0x00,0x09,0x00,0x4c,
1906 
1907         0x00,0x81,0x60,0x0d,
1908         0x00,0x91,0x60,0x0d,
1909         0x00,0x81,0x60,0x4d,
1910         0x00,0x91,0x60,0x4d,
1911 
1912         0x00,0xa1,0x60,0x0d,
1913         0x00,0xb1,0x60,0x0d,
1914         0x00,0xa1,0x60,0x4d,
1915         0x00,0xb1,0x60,0x4d,
1916     });
1917 
1918     test_asm(r, [&](A& a) {
1919         a.xtns2h(A::v0, A::v0);
1920         a.xtnh2b(A::v0, A::v0);
1921         a.strs  (A::v0, A::x0);
1922 
1923         a.ldrs   (A::v0, A::x0);
1924         a.uxtlb2h(A::v0, A::v0);
1925         a.uxtlh2s(A::v0, A::v0);
1926 
1927         a.uminv4s(A::v3, A::v4);
1928         a.movs   (A::x3, A::v4,0);  // mov.s w3,v4[0]
1929         a.movs   (A::x3, A::v4,1);  // mov.s w3,v4[1]
1930         a.inss   (A::v4, A::x3,3);  // ins.s v4[3],w3
1931     },{
1932         0x00,0x28,0x61,0x0e,
1933         0x00,0x28,0x21,0x0e,
1934         0x00,0x00,0x00,0xbd,
1935 
1936         0x00,0x00,0x40,0xbd,
1937         0x00,0xa4,0x08,0x2f,
1938         0x00,0xa4,0x10,0x2f,
1939 
1940         0x83,0xa8,0xb1,0x6e,
1941         0x83,0x3c,0x04,0x0e,
1942         0x83,0x3c,0x0c,0x0e,
1943         0x64,0x1c,0x1c,0x4e,
1944     });
1945 
1946     test_asm(r, [&](A& a) {
1947         a.ldrb(A::v0, A::x8);
1948         a.strb(A::v0, A::x8);
1949     },{
1950         0x00,0x01,0x40,0x3d,
1951         0x00,0x01,0x00,0x3d,
1952     });
1953 
1954     test_asm(r, [&](A& a) {
1955         a.ldrd(A::x0, A::x1, 3);   // ldr  x0, [x1, #24]
1956         a.ldrs(A::x0, A::x1, 3);   // ldr  w0, [x1, #12]
1957         a.ldrh(A::x0, A::x1, 3);   // ldrh w0, [x1, #6]
1958         a.ldrb(A::x0, A::x1, 3);   // ldrb w0, [x1, #3]
1959 
1960         a.strs(A::x0, A::x1, 3);   // str  w0, [x1, #12]
1961     },{
1962         0x20,0x0c,0x40,0xf9,
1963         0x20,0x0c,0x40,0xb9,
1964         0x20,0x0c,0x40,0x79,
1965         0x20,0x0c,0x40,0x39,
1966 
1967         0x20,0x0c,0x00,0xb9,
1968     });
1969 
1970     test_asm(r, [&](A& a) {
1971         a.tbl   (A::v0, A::v1, A::v2);
1972         a.uzp14s(A::v0, A::v1, A::v2);
1973         a.uzp24s(A::v0, A::v1, A::v2);
1974         a.zip14s(A::v0, A::v1, A::v2);
1975         a.zip24s(A::v0, A::v1, A::v2);
1976     },{
1977         0x20,0x00,0x02,0x4e,
1978         0x20,0x18,0x82,0x4e,
1979         0x20,0x58,0x82,0x4e,
1980         0x20,0x38,0x82,0x4e,
1981         0x20,0x78,0x82,0x4e,
1982     });
1983 }
1984 
DEF_TEST(SkVM_approx_math,r)1985 DEF_TEST(SkVM_approx_math, r) {
1986     auto eval = [](int N, float values[], auto fn) {
1987         skvm::Builder b;
1988         skvm::Ptr inout  = b.varying<float>();
1989 
1990         b.storeF(inout, fn(&b, b.loadF(inout)));
1991 
1992         b.done().eval(N, values);
1993     };
1994 
1995     auto compare = [r](int N, const float values[], const float expected[]) {
1996         for (int i = 0; i < N; ++i) {
1997             REPORTER_ASSERT(r, SkScalarNearlyEqual(values[i], expected[i], 0.001f));
1998         }
1999     };
2000 
2001     // log2
2002     {
2003         float values[] = {0.25f, 0.5f, 1, 2, 4, 8};
2004         constexpr int N = SK_ARRAY_COUNT(values);
2005         eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
2006             return b->approx_log2(v);
2007         });
2008         const float expected[] = {-2, -1, 0, 1, 2, 3};
2009         compare(N, values, expected);
2010     }
2011 
2012     // pow2
2013     {
2014         float values[] = {-2, -1, 0, 1, 2, 3};
2015         constexpr int N = SK_ARRAY_COUNT(values);
2016         eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
2017             return b->approx_pow2(v);
2018         });
2019         const float expected[] = {0.25f, 0.5f, 1, 2, 4, 8};
2020         compare(N, values, expected);
2021     }
2022 
2023     // powf -- x^0.5
2024     {
2025         float bases[] = {0, 1, 4, 9, 16};
2026         constexpr int N = SK_ARRAY_COUNT(bases);
2027         eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
2028             return b->approx_powf(base, b->splat(0.5f));
2029         });
2030         const float expected[] = {0, 1, 2, 3, 4};
2031         compare(N, bases, expected);
2032     }
2033     // powf -- 3^x
2034     {
2035         float exps[] = {-2, -1, 0, 1, 2};
2036         constexpr int N = SK_ARRAY_COUNT(exps);
2037         eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
2038             return b->approx_powf(b->splat(3.0f), exp);
2039         });
2040         const float expected[] = {1/9.0f, 1/3.0f, 1, 3, 9};
2041         compare(N, exps, expected);
2042     }
2043 
2044     auto test = [r](float arg, float expected, float tolerance, auto prog) {
2045         skvm::Builder b;
2046         skvm::Ptr inout  = b.varying<float>();
2047         b.storeF(inout, prog(b.loadF(inout)));
2048         float actual = arg;
2049         b.done().eval(1, &actual);
2050 
2051         float err = std::abs(actual - expected);
2052 
2053         if (err > tolerance) {
2054     //        SkDebugf("arg %g, expected %g, actual %g\n", arg, expected, actual);
2055             REPORTER_ASSERT(r, true);
2056         }
2057         return err;
2058     };
2059 
2060     auto test2 = [r](float arg0, float arg1, float expected, float tolerance, auto prog) {
2061         skvm::Builder b;
2062         skvm::Ptr in0  = b.varying<float>();
2063         skvm::Ptr in1  = b.varying<float>();
2064         skvm::Ptr out  = b.varying<float>();
2065         b.storeF(out, prog(b.loadF(in0), b.loadF(in1)));
2066         float actual;
2067         b.done().eval(1, &arg0, &arg1, &actual);
2068 
2069         float err = std::abs(actual - expected);
2070 
2071         if (err > tolerance) {
2072     //        SkDebugf("[%g, %g]: expected %g, actual %g\n", arg0, arg1, expected, actual);
2073             REPORTER_ASSERT(r, true);
2074         }
2075         return err;
2076     };
2077 
2078     // sine, cosine, tangent
2079     {
2080         constexpr float P = SK_ScalarPI;
2081         constexpr float tol = 0.00175f;
2082         for (float rad = -5*P; rad <= 5*P; rad += 0.1f) {
2083             test(rad, sk_float_sin(rad), tol, [](skvm::F32 x) {
2084                 return approx_sin(x);
2085             });
2086             test(rad, sk_float_cos(rad), tol, [](skvm::F32 x) {
2087                 return approx_cos(x);
2088             });
2089         }
2090 
2091         // Our tangent diverge more as we get near infinities (x near +- Pi/2),
2092         // so bring in the domain a little.
2093         constexpr float eps = 0.16f;
2094         float err = 0;
2095         for (float rad = -P/2 + eps; rad <= P/2 - eps; rad += 0.01f) {
2096             err += test(rad, sk_float_tan(rad), tol, [](skvm::F32 x) {
2097                 return approx_tan(x);
2098             });
2099             // try again with some multiples of P, to check our periodicity
2100             test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
2101                 return approx_tan(x + 3*P);
2102             });
2103             test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
2104                 return approx_tan(x - 3*P);
2105             });
2106         }
2107         if (0) { SkDebugf("tan error %g\n", err); }
2108     }
2109 
2110     // asin, acos, atan
2111     {
2112         constexpr float tol = 0.00175f;
2113         float err = 0;
2114         for (float x = -1; x <= 1; x += 1.0f/64) {
2115             err += test(x, asin(x), tol, [](skvm::F32 x) {
2116                 return approx_asin(x);
2117             });
2118             test(x, acos(x), tol, [](skvm::F32 x) {
2119                 return approx_acos(x);
2120             });
2121         }
2122         if (0) { SkDebugf("asin error %g\n", err); }
2123 
2124         err = 0;
2125         for (float x = -10; x <= 10; x += 1.0f/16) {
2126             err += test(x, atan(x), tol, [](skvm::F32 x) {
2127                 return approx_atan(x);
2128             });
2129         }
2130         if (0) { SkDebugf("atan error %g\n", err); }
2131 
2132         for (float y = -3; y <= 3; y += 1) {
2133             for (float x = -3; x <= 3; x += 1) {
2134                 err += test2(y, x, atan2(y,x), tol, [](skvm::F32 y, skvm::F32 x) {
2135                     return approx_atan2(y,x);
2136                 });
2137             }
2138         }
2139         if (0) { SkDebugf("atan2 error %g\n", err); }
2140     }
2141 }
2142 
DEF_TEST(SkVM_min_max,r)2143 DEF_TEST(SkVM_min_max, r) {
2144     // min() and max() have subtle behavior when one argument is NaN and
2145     // the other isn't.  It's not sound to blindly swap their arguments.
2146     //
2147     // All backends must behave like std::min() and std::max(), which are
2148     //
2149     //    min(x,y) = y<x ? y : x
2150     //    max(x,y) = x<y ? y : x
2151 
2152     // ±NaN, ±0, ±1, ±inf
2153     const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
2154                              0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
2155 
2156     float f[8];
2157     memcpy(f, bits, sizeof(bits));
2158 
2159     auto identical = [&](float x, float y) {
2160         uint32_t X,Y;
2161         memcpy(&X, &x, 4);
2162         memcpy(&Y, &y, 4);
2163         return X == Y;
2164     };
2165 
2166     // Test min/max with non-constant x, non-constant y.
2167     // (Whether x and y are varying or uniform shouldn't make any difference.)
2168     {
2169         skvm::Builder b;
2170         {
2171             skvm::Ptr src = b.varying<float>(),
2172                        mn = b.varying<float>(),
2173                        mx = b.varying<float>();
2174 
2175             skvm::F32 x = b.loadF(src),
2176                       y = b.uniformF(b.uniform(), 0);
2177 
2178             b.storeF(mn, b.min(x,y));
2179             b.storeF(mx, b.max(x,y));
2180         }
2181 
2182         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2183             float mn[8], mx[8];
2184             for (int i = 0; i < 8; i++) {
2185                 // min() and max() everything with f[i].
2186                 program.eval(8, f,mn,mx, &f[i]);
2187 
2188                 for (int j = 0; j < 8; j++) {
2189                     REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
2190                     REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
2191                 }
2192             }
2193         });
2194     }
2195 
2196     // Test each with constant on the right.
2197     for (int i = 0; i < 8; i++) {
2198         skvm::Builder b;
2199         {
2200             skvm::Ptr src = b.varying<float>(),
2201                        mn = b.varying<float>(),
2202                        mx = b.varying<float>();
2203 
2204             skvm::F32 x = b.loadF(src),
2205                       y = b.splat(f[i]);
2206 
2207             b.storeF(mn, b.min(x,y));
2208             b.storeF(mx, b.max(x,y));
2209         }
2210 
2211         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2212             float mn[8], mx[8];
2213             program.eval(8, f,mn,mx);
2214             for (int j = 0; j < 8; j++) {
2215                 REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
2216                 REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
2217             }
2218         });
2219     }
2220 
2221     // Test each with constant on the left.
2222     for (int i = 0; i < 8; i++) {
2223         skvm::Builder b;
2224         {
2225             skvm::Ptr src = b.varying<float>(),
2226                        mn = b.varying<float>(),
2227                        mx = b.varying<float>();
2228 
2229             skvm::F32 x = b.splat(f[i]),
2230                       y = b.loadF(src);
2231 
2232             b.storeF(mn, b.min(x,y));
2233             b.storeF(mx, b.max(x,y));
2234         }
2235 
2236         test_jit_and_interpreter(b, [&](const skvm::Program& program){
2237             float mn[8], mx[8];
2238             program.eval(8, f,mn,mx);
2239             for (int j = 0; j < 8; j++) {
2240                 REPORTER_ASSERT(r, identical(mn[j], std::min(f[i], f[j])));
2241                 REPORTER_ASSERT(r, identical(mx[j], std::max(f[i], f[j])));
2242             }
2243         });
2244     }
2245 }
2246 
// Round-trips a table of values through from_fp16() and to_fp16().
DEF_TEST(SkVM_halfs, r) {
    constexpr int kCount = 8;

    // halfBits[i] is the 16-bit pattern the test pairs with asFloat[i].
    const uint16_t halfBits[kCount] = {
        0x0000,0x3800,0x3c00,0x4000,
        0xc400,0xb800,0xbc00,0xc000,
    };
    const float asFloat[kCount] = {
        +0.0f,+0.5f,+1.0f,+2.0f,
        -4.0f,-0.5f,-1.0f,-2.0f,
    };

    // 16-bit input -> from_fp16 -> float output.
    {
        skvm::Builder b;
        skvm::Ptr in  = b.varying<uint16_t>();
        skvm::Ptr out = b.varying<float>();
        b.storeF(out, b.from_fp16(b.load16(in)));

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            float widened[kCount];
            program.eval(kCount, halfBits, widened);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, widened[i] == asFloat[i]);
            }
        });
    }

    // float input -> to_fp16 -> 16-bit output.
    {
        skvm::Builder b;
        skvm::Ptr in  = b.varying<float>();
        skvm::Ptr out = b.varying<uint16_t>();
        b.store16(out, b.to_fp16(b.loadF(in)));

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint16_t narrowed[kCount];
            program.eval(kCount, asFloat, narrowed);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, narrowed[i] == halfBits[i]);
            }
        });
    }
}
2281 
// Exercises load64() and store64() by splitting and joining 64-bit values
// whose low and high 32-bit halves are known.
DEF_TEST(SkVM_64bit, r) {
    constexpr int kCount = 65;

    uint32_t lo[kCount];
    uint32_t hi[kCount];
    uint64_t wide[kCount];
    for (int i = 0; i < kCount; i++) {
        // Even numbers go in the low half, odd numbers in the high half.
        lo[i] = 2*i;
        hi[i] = 2*i+1;
        wide[i] = ((uint64_t)hi[i] << 32)
                | ((uint64_t)lo[i] <<  0);
    }

    // Split: read each half of the 64-bit value with load64() and store it as 32 bits.
    {
        skvm::Builder b;
        {
            skvm::Ptr widePtr = b.varying<uint64_t>();
            skvm::Ptr loPtr   = b.varying<int>();
            skvm::Ptr hiPtr   = b.varying<int>();
            b.store32(loPtr, b.load64(widePtr, 0));
            b.store32(hiPtr, b.load64(widePtr, 1));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint32_t gotLo[kCount],
                     gotHi[kCount];
            program.eval(kCount, wide,gotLo,gotHi);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, gotLo[i] == lo[i]);
                REPORTER_ASSERT(r, gotHi[i] == hi[i]);
            }
        });
    }

    // Join: combine two 32-bit loads into one store64().
    {
        skvm::Builder b;
        {
            skvm::Ptr widePtr = b.varying<uint64_t>();
            skvm::Ptr loPtr   = b.varying<int>();
            skvm::Ptr hiPtr   = b.varying<int>();
            b.store64(widePtr, b.load32(loPtr), b.load32(hiPtr));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint64_t gotWide[kCount];
            program.eval(kCount, gotWide,lo,hi);
            for (int i = 0; i < kCount; i++) {
                REPORTER_ASSERT(r, gotWide[i] == wide[i]);
            }
        });
    }
}
2329 
// Converts 63 pixels between RGBA F32 and RGBA 8888 in both directions.
DEF_TEST(SkVM_128bit, r) {
    constexpr int kPixels   = 63;
    constexpr int kChannels = 4*kPixels;

    float   floats[kChannels];
    uint8_t packed[kChannels];

    for (int i = 0; i < kChannels; i++) {
        floats[i] = i * (1/255.0f);
    }

    const skvm::PixelFormat rgba_ffff =
            skvm::SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);
    const skvm::PixelFormat rgba_8888 =
            skvm::SkColorType_to_PixelFormat(kRGBA_8888_SkColorType);

    // F32 -> 8888, which exercises 128-bit loads.
    {
        skvm::Builder b;
        {
            skvm::Ptr dst = b.varying(4);
            skvm::Ptr src = b.varying(16);

            b.store(rgba_8888, dst, b.load(rgba_ffff, src));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            memset(packed, 0, sizeof(packed));
            program.eval(kPixels, packed, floats);
            for (int i = 0; i < kChannels; i++) {
                REPORTER_ASSERT(r, packed[i] == i);
            }
        });
    }

    // 8888 -> F32, which exercises 128-bit stores.
    // The input here is the packed[] array written by the conversion above.
    {
        skvm::Builder b;
        {
            skvm::Ptr dst = b.varying(16);
            skvm::Ptr src = b.varying(4);

            b.store(rgba_ffff, dst, b.load(rgba_8888, src));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            memset(floats, 0, sizeof(floats));
            program.eval(kPixels, floats, packed);
            for (int i = 0; i < kChannels; i++) {
                REPORTER_ASSERT(r, floats[i] == i * (1/255.0f));
            }
        });
    }
}
2379 
// Checks the masks produced by is_NaN() and is_finite() for ±NaN, ±0, ±1 and ±inf.
DEF_TEST(SkVM_is_NaN_is_finite, r) {
    skvm::Builder b;
    {
        skvm::Ptr src = b.varying<float>();
        skvm::Ptr nan = b.varying<int>();
        skvm::Ptr fin = b.varying<int>();
        b.store32(nan, is_NaN   (b.loadF(src)));
        b.store32(fin, is_finite(b.loadF(src)));
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 8;
        constexpr uint32_t kYes = 0xffffffff,
                           kNo  = 0;

        // Inputs, in order:            +NaN         -NaN         +0           -0
        //                              +1           -1           +inf         -inf
        const uint32_t bits[kCount] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                                       0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};

        // Only the first two inputs are NaN; only the zeros and ones are finite.
        const uint32_t wantNaN[kCount] = {kYes,kYes, kNo,kNo,  kNo,kNo,   kNo,kNo};
        const uint32_t wantFin[kCount] = { kNo, kNo, kYes,kYes, kYes,kYes, kNo,kNo};

        uint32_t nan[kCount],
                 fin[kCount];
        program.eval(kCount, bits, nan,fin);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, nan[i] == wantNaN[i]);
            REPORTER_ASSERT(r, fin[i] == wantFin[i]);
        }
    });
}
2403 
// Runs a program with six pointer arguments: one output and five inputs that are summed.
DEF_TEST(SkVM_args, r) {
    skvm::Builder b;
    {
        skvm::Ptr out = b.varying<float>();
        skvm::Ptr in0 = b.varying<float>();
        skvm::Ptr in1 = b.varying<float>();
        skvm::Ptr in2 = b.varying<float>();
        skvm::Ptr in3 = b.varying<float>();
        skvm::Ptr in4 = b.varying<float>();
        storeF(out, b.loadF(in0)
                  + b.loadF(in1)
                  + b.loadF(in2)
                  + b.loadF(in3)
                  + b.loadF(in4));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 17;
        float sum[kCount],
              v0[kCount], v1[kCount], v2[kCount], v3[kCount], v4[kCount];

        // Every input holds its own index, so each sum should be five times the index.
        for (int i = 0; i < kCount; i++) {
            const float value = (float)i;
            v4[i] = value;
            v3[i] = value;
            v2[i] = value;
            v1[i] = value;
            v0[i] = value;
        }

        program.eval(kCount, sum,v0,v1,v2,v3,v4);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, sum[i] == 5.0f*i);
        }
    });
}
2432 
// Regression test reduced from a real draw that failed because of
// a bad arm64 implementation of pack().
DEF_TEST(SkVM_badpack, reporter) {
    skvm::Builder p;
    {
        skvm::UPtr uniforms = p.uniform();
        skvm::Ptr  dst      = p.varying<uint16_t>();

        skvm::I32 scaled = round(p.uniformF(uniforms, 8) * 15);
        skvm::I32 nibble = p.splat(0xf);

        // Pack `scaled` at bit 12 and `nibble` at bit 0 of an initially zero value.
        skvm::I32 packed = pack(p.splat(0), scaled, 12);
        packed = pack(packed, nibble, 0);
        store16(dst, packed);
    }

    test_jit_and_interpreter(p, [&](const skvm::Program& program){
        constexpr int kCount = 17;

        const float uniforms[] = {
            0.0f, 0.0f,
            1.0f, 0.0f, 0.0f, 1.0f,
        };

        uint16_t dst[kCount] = {0};
        program.eval(kCount, uniforms,dst);
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(reporter, dst[i] == 0xf00f, "got %04x, want %04x\n", dst[i], 0xf00f);
        }
    });
}
2461 
// Checks that the optimized instruction count of x*x+x depends on Features::fma.
DEF_TEST(SkVM_features, r) {
    // Emits: load x, compute x*x+x, store the result.
    auto build_program = [](skvm::Builder* b) {
        skvm::Ptr src = b->varying<float>();
        skvm::F32 x   = b->loadF(src);

        skvm::Ptr dst = b->varying<float>();
        b->storeF(dst, x*x+x);
    };

    // FMA available: load, fma, store.
    {
        skvm::Features withFMA;
        withFMA.fma = true;

        skvm::Builder b(withFMA);
        build_program(&b);
        REPORTER_ASSERT(r, b.optimize().size() == 3);
    }

    // FMA unavailable: load, mul, add, store.
    {
        skvm::Features withoutFMA;
        withoutFMA.fma = false;

        skvm::Builder b(withoutFMA);
        build_program(&b);
        REPORTER_ASSERT(r, b.optimize().size() == 4);
    }

    // Default-constructed builder: features are auto-detected, so either count is acceptable.
    {
        skvm::Builder b;
        build_program(&b);
        REPORTER_ASSERT(r, b.optimize().size() == 3
                        || b.optimize().size() == 4);
    }
}
2491 
// A gather is only as varying as its index: this checks where the loop starts
// for a gather with a varying index and for one with a uniform index.
DEF_TEST(SkVM_gather_can_hoist, r) {
    // Varying index: the usual way a gather is used.
    {
        skvm::Builder b;
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  buf      = b.varying<int>();

        skvm::I32 index    = b.load32(buf);
        skvm::I32 gathered = b.gather32(uniforms,0, index);
        b.store32(buf, gathered);

        skvm::Program program = b.done();

        // The index is loaded per element, so nothing can leave the loop:
        //
        // loop:
        //     v0 = load32 buf
        //     v1 = gather32 uniforms+0 v0
        //     store32 buf v1
        REPORTER_ASSERT(r, program.instructions().size() == 3);
        REPORTER_ASSERT(r, program.loop() == 0);
    }

    // Uniform index: same program, except the index comes from the uniforms.
    {
        skvm::Builder b;
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  buf      = b.varying<int>();

        skvm::I32 index    = b.uniform32(uniforms,8);
        skvm::I32 gathered = b.gather32(uniforms,0, index);
        b.store32(buf, gathered);

        skvm::Program program = b.done();

        // The index is uniform, so the gather is uniform too and both sit ahead of the loop:
        //
        // v0 = uniform32 uniforms+8
        // v1 = gather32 uniforms+0 v0
        // loop:
        //     store32 buf v1
        REPORTER_ASSERT(r, program.instructions().size() == 3);
        REPORTER_ASSERT(r, program.loop() == 2);
    }
}
2534 
// Guards against deduplicating two identical loads that have a store between them.
DEF_TEST(SkVM_dont_dedup_loads, r) {
    // Ops with identical arguments are normally assumed to produce identical values and
    // are merged, which acts as a simple common subexpression eliminator.
    //
    // That is unsound for two loads of the same pointer separated by a store to it.
    // If the loads below were merged, each element would grow by 1 instead of by K.
    constexpr int K = 2;
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();
        for (int pass = 0; pass < K; pass++) {
            skvm::I32 bumped = b.load32(buf) + 1;
            b.store32(buf, bumped);
        }
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 5;
        int buf[kCount] = { 0,1,2,3,4 };
        program.eval(kCount, buf);
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, buf[i] == i+K);
        }
    });
}
2558 
// Guards against deduplicating two identical stores that have a different store between them.
DEF_TEST(SkVM_dont_dedup_stores, r) {
    // The reasoning mirrors SkVM_dont_dedup_loads: an intervening store overwrites
    // the first one, so the repeated store has to be issued again.
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();

        skvm::I32 four = b.splat(4);
        b.store32(buf, four);

        skvm::I32 five = b.splat(5);
        b.store32(buf, five);

        // Dropping this store as a duplicate of the first would leave 5 in the buffer.
        skvm::I32 fourAgain = b.splat(4);
        b.store32(buf, fourAgain);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 42;
        int buf[kCount];
        program.eval(kCount, buf);
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, buf[i] == 4);
        }
    });
}
2579 
// Compares fast_mul(0.0f, x) with the ordinary product 0.0f * x over ±0, ±1, ±inf and ±NaN.
DEF_TEST(SkVM_fast_mul, r) {
    skvm::Builder b;
    {
        skvm::Ptr src  = b.varying<float>();
        skvm::Ptr fast = b.varying<float>();
        skvm::Ptr slow = b.varying<float>();

        skvm::F32 x = b.loadF(src);
        b.storeF(fast, fast_mul(0.0f, x));
        b.storeF(slow, 0.0f * x);
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        constexpr int kCount = 8;
        // The first kZeroProducts inputs (±0, ±1) give 0 under ordinary multiplication.
        constexpr int kZeroProducts = 4;

        const uint32_t bits[kCount] = {
            0x0000'0000, 0x8000'0000, //±0
            0x3f80'0000, 0xbf80'0000, //±1
            0x7f80'0000, 0xff80'0000, //±inf
            0x7f80'0001, 0xff80'0001, //±NaN
        };
        float fast[kCount],
              slow[kCount];
        program.eval(kCount,bits,fast,slow);

        for (int i = 0; i < kCount; i++) {
            // fast_mul() by zero is expected to give zero for every input.
            REPORTER_ASSERT(r, fast[i] == 0.0f);

            // The ordinary product is NaN for the ±inf and ±NaN inputs.
            if (i >= kZeroProducts) {
                REPORTER_ASSERT(r, isnan(slow[i]));
            } else {
                REPORTER_ASSERT(r, slow[i] == 0.0f);
            }
        }
    });
}
2612