/*
 * Copyright 2019 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
8 #include "include/core/SkColorPriv.h"
9 #include "include/private/SkColorData.h"
10 #include "src/core/SkCpu.h"
11 #include "src/core/SkMSAN.h"
12 #include "src/core/SkVM.h"
13 #include "tests/Test.h"
14
15 template <typename Fn>
test_jit_and_interpreter(const skvm::Builder & b,Fn && test)16 static void test_jit_and_interpreter(const skvm::Builder& b, Fn&& test) {
17 skvm::Program p = b.done();
18 test(p);
19 if (p.hasJIT()) {
20 test(b.done(/*debug_name=*/nullptr, /*allow_jit=*/false));
21 }
22 }
23
DEF_TEST(SkVM_eliminate_dead_code, r) {
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();
        skvm::I32 l = b.load32(arg);
        skvm::I32 a = b.add(l, l);
        b.add(a, b.splat(7));   // Never stored anywhere, so the whole chain is dead.
    }

    std::vector<skvm::Instruction> program = b.program();
    REPORTER_ASSERT(r, program.size() == 4);

    // With no side effects (no stores), dead-code elimination should remove everything.
    program = skvm::eliminate_dead_code(program);
    REPORTER_ASSERT(r, program.size() == 0);
}
39
DEF_TEST(SkVM_Pointless, r) {
    // Let's build a program with no memory arguments.
    // It should all be pegged as dead code, but we should be able to "run" it.
    skvm::Builder b;
    {
        b.add(b.splat(5.0f),
              b.splat(4.0f));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        for (int N = 0; N < 64; N++) {
            program.eval(N);
        }
    });

    // Every instruction is unused (death == 0) and loop-invariant (hoistable).
    for (const skvm::OptimizedInstruction& inst : b.optimize()) {
        REPORTER_ASSERT(r, inst.death == 0 && inst.can_hoist == true);
    }
}
59
DEF_TEST(SkVM_memset, r) {
    // A one-instruction memset32: store the constant 42 to each lane.
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int buf[18];
        buf[17] = 47;   // Sentinel just past the evaluated range.

        p.eval(17, buf);
        for (int i = 0; i < 17; i++) {
            REPORTER_ASSERT(r, buf[i] == 42);
        }
        REPORTER_ASSERT(r, buf[17] == 47);   // The program must not write past N.
    });
}
75
DEF_TEST(SkVM_memcpy, r) {
    // A one-instruction memcpy: load from src, store to dst.
    skvm::Builder b;
    {
        auto src = b.varying<int>(),
             dst = b.varying<int>();
        b.store32(dst, b.load32(src));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& p) {
        int src[] = {1,2,3,4,5,6,7,8,9},
            dst[] = {0,0,0,0,0,0,0,0,0};

        // Copy all but the last element; the last dst slot must stay untouched.
        p.eval(SK_ARRAY_COUNT(src)-1, src, dst);
        for (size_t i = 0; i < SK_ARRAY_COUNT(src)-1; i++) {
            REPORTER_ASSERT(r, dst[i] == src[i]);
        }
        size_t i = SK_ARRAY_COUNT(src)-1;
        REPORTER_ASSERT(r, dst[i] == 0);
    });
}
96
DEF_TEST(SkVM_allow_jit, r) {
    skvm::Builder b;
    {
        auto src = b.varying<int>(),
             dst = b.varying<int>();
        b.store32(dst, b.load32(src));
    }

    // If this program can JIT at all, building it with allow_jit=false
    // must produce a program without JIT code.
    if (b.done("test-allow_jit", /*allow_jit=*/true).hasJIT()) {
        REPORTER_ASSERT(r, !b.done("", false).hasJIT());
    }
}
109
DEF_TEST(SkVM_LoopCounts, r) {
    // Make sure we cover all the exact N we want.

    // buf[i] += 1
    skvm::Builder b;
    skvm::Ptr arg = b.varying<int>();
    b.store32(arg,
              b.add(b.splat(1),
                    b.load32(arg)));

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[64];
        // Try every loop count 0..64 to exercise head/body/tail handling.
        for (int N = 0; N <= (int)SK_ARRAY_COUNT(buf); N++) {
            for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
                buf[i] = i;
            }
            program.eval(N, buf);

            // First N lanes incremented...
            for (int i = 0; i < N; i++) {
                REPORTER_ASSERT(r, buf[i] == i+1);
            }
            // ...the rest untouched.
            for (int i = N; i < (int)SK_ARRAY_COUNT(buf); i++) {
                REPORTER_ASSERT(r, buf[i] == i);
            }
        }
    });
}
137
DEF_TEST(SkVM_gather32, r) {
    // buf[i] = img[buf[i] & 7]: a masked 32-bit gather through a uniform pointer.
    skvm::Builder b;
    {
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr buf = b.varying<int>();
        skvm::I32 x = b.load32(buf);
        b.store32(buf, b.gather32(uniforms,0, b.bit_and(x, b.splat(7))));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        int buf[20];
        for (int i = 0; i < 20; i++) {
            buf[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        program.eval(20, &uniforms, buf);
        // Indices wrap every 8 thanks to the &7 mask.
        int i = 0;
        REPORTER_ASSERT(r, buf[i] == 12); i++;
        REPORTER_ASSERT(r, buf[i] == 34); i++;
        REPORTER_ASSERT(r, buf[i] == 56); i++;
        REPORTER_ASSERT(r, buf[i] == 78); i++;
        REPORTER_ASSERT(r, buf[i] == 90); i++;
        REPORTER_ASSERT(r, buf[i] == 98); i++;
        REPORTER_ASSERT(r, buf[i] == 76); i++;
        REPORTER_ASSERT(r, buf[i] == 54); i++;

        REPORTER_ASSERT(r, buf[i] == 12); i++;
        REPORTER_ASSERT(r, buf[i] == 34); i++;
        REPORTER_ASSERT(r, buf[i] == 56); i++;
        REPORTER_ASSERT(r, buf[i] == 78); i++;
        REPORTER_ASSERT(r, buf[i] == 90); i++;
        REPORTER_ASSERT(r, buf[i] == 98); i++;
        REPORTER_ASSERT(r, buf[i] == 76); i++;
        REPORTER_ASSERT(r, buf[i] == 54); i++;

        REPORTER_ASSERT(r, buf[i] == 12); i++;
        REPORTER_ASSERT(r, buf[i] == 34); i++;
        REPORTER_ASSERT(r, buf[i] == 56); i++;
        REPORTER_ASSERT(r, buf[i] == 78); i++;
    });
}
185
DEF_TEST(SkVM_gathers, r) {
    // Exercise 32-, 16-, and 8-bit gathers from the same uniform base pointer,
    // each with a mask sized so indices stay within the 32-byte img.
    skvm::Builder b;
    {
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr buf32 = b.varying<int>(),
                  buf16 = b.varying<uint16_t>(),
                  buf8  = b.varying<uint8_t>();

        skvm::I32 x = b.load32(buf32);

        b.store32(buf32, b.gather32(uniforms,0, b.bit_and(x, b.splat( 7))));
        b.store16(buf16, b.gather16(uniforms,0, b.bit_and(x, b.splat(15))));
        b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        constexpr int N = 20;
        int      buf32[N];
        uint16_t buf16[N];
        uint8_t  buf8 [N];

        for (int i = 0; i < 20; i++) {
            buf32[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        program.eval(N, &uniforms, buf32, buf16, buf8);
        // The 16- and 8-bit gathers reinterpret img's bytes, so they pick up
        // the little-endian halves/bytes of the 32-bit values.
        int i = 0;
        REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 12); i++;
        REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 56 && buf8[i] == 34); i++;
        REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 78 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;

        REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 90 && buf8[i] == 56); i++;
        REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 98 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 90 && buf16[i] == 76 && buf8[i] == 78); i++;
        REPORTER_ASSERT(r, buf32[i] == 98 && buf16[i] ==  0 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 76 && buf16[i] == 54 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 54 && buf16[i] ==  0 && buf8[i] ==  0); i++;

        REPORTER_ASSERT(r, buf32[i] == 12 && buf16[i] == 12 && buf8[i] == 90); i++;
        REPORTER_ASSERT(r, buf32[i] == 34 && buf16[i] ==  0 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 56 && buf16[i] == 34 && buf8[i] ==  0); i++;
        REPORTER_ASSERT(r, buf32[i] == 78 && buf16[i] ==  0 && buf8[i] ==  0); i++;
    });
}
243
DEF_TEST(SkVM_gathers2, r) {
    // Unmasked gathers at all three widths from a 256-byte identity table.
    skvm::Builder b;
    {
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr buf32 = b.varying<int>(),
                  buf16 = b.varying<uint16_t>(),
                  buf8  = b.varying<uint8_t>();

        skvm::I32 x = b.load32(buf32);

        b.store32(buf32, b.gather32(uniforms,0, x));
        b.store16(buf16, b.gather16(uniforms,0, x));
        b.store8 (buf8 , b.gather8 (uniforms,0, x));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        uint8_t img[256];
        for (int i = 0; i < 256; i++) {
            img[i] = i;   // img[i] == i makes the expected values easy to derive.
        }

        int      buf32[64];
        uint16_t buf16[64];
        uint8_t  buf8 [64];

        for (int i = 0; i < 64; i++) {
            buf32[i] = (i*47)&63;   // A permutation of 0..63 as gather indices.
            buf16[i] = 0;
            buf8 [i] = 0;
        }

        struct Uniforms {
            const uint8_t* img;
        } uniforms{img};

        program.eval(64, &uniforms, buf32, buf16, buf8);

        for (int i = 0; i < 64; i++) {
            REPORTER_ASSERT(r, buf8[i] == ((i*47)&63));  // 0,47,30,13,60,...
        }

        // 16-/32-bit gathers read 2/4 consecutive identity bytes (little-endian).
        REPORTER_ASSERT(r, buf16[ 0] == 0x0100);
        REPORTER_ASSERT(r, buf16[63] == 0x2322);

        REPORTER_ASSERT(r, buf32[ 0] == 0x03020100);
        REPORTER_ASSERT(r, buf32[63] == 0x47464544);
    });
}
292
DEF_TEST(SkVM_bitops, r) {
    // Run one value (0x42) through the full set of bitwise and shift ops,
    // with the expected intermediate result noted at each step.
    skvm::Builder b;
    {
        skvm::Ptr ptr = b.varying<int>();

        skvm::I32 x = b.load32(ptr);

        x = b.bit_and  (x, b.splat(0xf1));  // 0x40
        x = b.bit_or   (x, b.splat(0x80));  // 0xc0
        x = b.bit_xor  (x, b.splat(0xfe));  // 0x3e
        x = b.bit_clear(x, b.splat(0x30));  // 0x0e

        x = b.shl(x, 28);  // 0xe000'0000
        x = b.sra(x, 28);  // 0xffff'fffe (arithmetic shift extends the sign)
        x = b.shr(x,  1);  // 0x7fff'ffff (logical shift brings in a zero)

        b.store32(ptr, x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int x = 0x42;
        program.eval(1, &x);
        REPORTER_ASSERT(r, x == 0x7fff'ffff);
    });
}
318
DEF_TEST(SkVM_select_is_NaN, r) {
    // select(is_NaN(x), 0, x) should peephole down to neq_f32 + bit_clear.
    skvm::Builder b;
    {
        skvm::Ptr src = b.varying<float>(),
                  dst = b.varying<float>();

        skvm::F32 x = b.loadF(src);
        x = select(is_NaN(x), b.splat(0.0f)
                            , x);
        b.storeF(dst, x);
    }

    std::vector<skvm::OptimizedInstruction> program = b.optimize();
    REPORTER_ASSERT(r, program.size() == 4);
    REPORTER_ASSERT(r, program[0].op == skvm::Op::load32);
    REPORTER_ASSERT(r, program[1].op == skvm::Op::neq_f32);
    REPORTER_ASSERT(r, program[2].op == skvm::Op::bit_clear);
    REPORTER_ASSERT(r, program[3].op == skvm::Op::store32);

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        // ±NaN, ±0, ±1, ±inf
        uint32_t src[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                          0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        uint32_t dst[SK_ARRAY_COUNT(src)];
        program.eval(SK_ARRAY_COUNT(src), src, dst);

        // Only the two NaN inputs map to zero; everything else passes through.
        for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
            REPORTER_ASSERT(r, dst[i] == (i < 2 ? 0 : src[i]));
        }
    });
}
350
DEF_TEST(SkVM_f32, r) {
    // Basic float arithmetic: for any nonzero x, ((x+x)-x)/x == 1.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<float>();

        skvm::F32 x = b.loadF(arg),
                  y = b.add(x,x),   // y = 2x
                  z = b.sub(y,x),   // z = 2x-x = x
                  w = b.div(z,x);   // w = x/x = 1
        b.storeF(arg, w);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float buf[] = { 1,2,3,4,5,6,7,8,9 };
        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (float v : buf) {
            REPORTER_ASSERT(r, v == 1.0f);
        }
    });
}
371
DEF_TEST(SkVM_cmp_i32, r) {
    // Pack the result of each integer comparison into its own bit of a mask.
    skvm::Builder b;
    {
        skvm::I32 x = b.load32(b.varying<int>());

        // Comparisons yield all-ones masks; keep bit 0 and shift into position.
        auto to_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 m = b.splat(0);
        m = b.bit_or(m, to_bit(0, b. eq(x, b.splat(0))));
        m = b.bit_or(m, to_bit(1, b.neq(x, b.splat(1))));
        m = b.bit_or(m, to_bit(2, b. lt(x, b.splat(2))));
        m = b.bit_or(m, to_bit(3, b.lte(x, b.splat(3))));
        m = b.bit_or(m, to_bit(4, b. gt(x, b.splat(4))));
        m = b.bit_or(m, to_bit(5, b.gte(x, b.splat(5))));

        b.store32(b.varying<int>(), m);
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        REPORTER_ASSERT(r, out[0] == 0b001111);
        REPORTER_ASSERT(r, out[1] == 0b001100);
        REPORTER_ASSERT(r, out[2] == 0b001010);
        REPORTER_ASSERT(r, out[3] == 0b001010);
        REPORTER_ASSERT(r, out[4] == 0b000010);
        for (int i = 5; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == 0b110010);
        }
    });
}
407
DEF_TEST(SkVM_cmp_f32, r) {
    // Float analogue of SkVM_cmp_i32: one mask bit per comparison op.
    skvm::Builder b;
    {
        skvm::F32 x = b.loadF(b.varying<float>());

        // Comparisons yield all-ones masks; keep bit 0 and shift into position.
        auto to_bit = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 m = b.splat(0);
        m = b.bit_or(m, to_bit(0, b. eq(x, b.splat(0.0f))));
        m = b.bit_or(m, to_bit(1, b.neq(x, b.splat(1.0f))));
        m = b.bit_or(m, to_bit(2, b. lt(x, b.splat(2.0f))));
        m = b.bit_or(m, to_bit(3, b.lte(x, b.splat(3.0f))));
        m = b.bit_or(m, to_bit(4, b. gt(x, b.splat(4.0f))));
        m = b.bit_or(m, to_bit(5, b.gte(x, b.splat(5.0f))));

        b.store32(b.varying<int>(), m);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        program.eval(SK_ARRAY_COUNT(in), in, out);

        REPORTER_ASSERT(r, out[0] == 0b001111);
        REPORTER_ASSERT(r, out[1] == 0b001100);
        REPORTER_ASSERT(r, out[2] == 0b001010);
        REPORTER_ASSERT(r, out[3] == 0b001010);
        REPORTER_ASSERT(r, out[4] == 0b000010);
        for (int i = 5; i < (int)SK_ARRAY_COUNT(out); i++) {
            REPORTER_ASSERT(r, out[i] == 0b110010);
        }
    });
}
444
DEF_TEST(SkVM_index, r) {
    // b.index() reports the number of lanes remaining, so lane i of an
    // N-lane eval sees N-i (counting down), not i.
    skvm::Builder b;
    b.store32(b.varying<int>(), b.index());

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[23];
        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            REPORTER_ASSERT(r, buf[i] == (int)SK_ARRAY_COUNT(buf)-i);
        }
    });
}
457
DEF_TEST(SkVM_mad, r) {
    // This program is designed to exercise the tricky corners of instruction
    // and register selection for Op::mad_f32.

    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  y = b.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = b.mad(y,y,x),   // y is needed in the future, but r[z] = r[x] is ok.
                  w = b.mad(z,z,y),   // w can alias z but not y.
                  v = b.mad(w,y,w);   // Got to stop somewhere.
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int x = 2;
        program.eval(1, &x);
        // x = 2
        // y = 2*2 + 2 = 6
        // z = 6*6 + 2 = 38
        // w = 38*38 + 6 = 1450
        // v = 1450*6 + 1450 = 10150
        REPORTER_ASSERT(r, x == 10150);
    });
}
485
DEF_TEST(SkVM_fms, r) {
    // Create a pattern that can be peepholed into an Op::fms_f32.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.mul(x, b.splat(2.0f)),
                            b.splat(1.0f));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        program.eval((int)SK_ARRAY_COUNT(buf), &buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            // Was `buf[i] = 2*i-1` (assignment), which asserted nothing.
            REPORTER_ASSERT(r, buf[i] == 2*i-1);
        }
    });
}
507
DEF_TEST(SkVM_fnma, r) {
    // Create a pattern that can be peepholed into an Op::fnma_f32.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();

        skvm::F32 x = b.to_F32(b.load32(arg)),
                  v = b.sub(b.splat(1.0f),
                            b.mul(x, b.splat(2.0f)));
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = {0,1,2,3,4,5,6,7,8,9,10};
        program.eval((int)SK_ARRAY_COUNT(buf), &buf);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            // Was `buf[i] = 1-2*i` (assignment), which asserted nothing.
            REPORTER_ASSERT(r, buf[i] == 1-2*i);
        }
    });
}
529
DEF_TEST(SkVM_madder, r) {
    // More register-reuse corners for mad, this time in float.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<float>();

        skvm::F32 x = b.loadF(arg),
                  y = b.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = b.mad(y,x,y),   // r[x] can be reused after this instruction, but not r[y].
                  w = b.mad(y,y,z);
        b.storeF(arg, w);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float x = 2.0f;
        // y = 2*2 + 2 = 6
        // z = 6*2 + 6 = 18
        // w = 6*6 + 18 = 54
        program.eval(1, &x);
        REPORTER_ASSERT(r, x == 54.0f);
    });
}
551
DEF_TEST(SkVM_floor, r) {
    // floor() rounds toward negative infinity, including for negative inputs.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<float>();
        b.storeF(arg, b.floor(b.loadF(arg)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float buf[]  = { -2.0f, -1.5f, -1.0f, 0.0f, 1.0f, 1.5f, 2.0f };
        float want[] = { -2.0f, -2.0f, -1.0f, 0.0f, 1.0f, 1.0f, 2.0f };
        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            REPORTER_ASSERT(r, buf[i] == want[i]);
        }
    });
}
568
DEF_TEST(SkVM_round, r) {
    skvm::Builder b;
    {
        skvm::Ptr src = b.varying<float>();
        skvm::Ptr dst = b.varying<int>();
        b.store32(dst, b.round(b.loadF(src)));
    }

    // The test cases on exact 0.5f boundaries assume the current rounding mode is nearest even.
    // We haven't explicitly guaranteed that here... it just probably is.
    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float buf[]  = { -1.5f, -0.5f, 0.0f, 0.5f, 0.2f, 0.6f, 1.0f, 1.4f, 1.5f, 2.0f };
        int   want[] = { -2   ,  0   , 0   , 0   , 0   , 1   , 1   , 1   , 2   , 2    };
        int dst[SK_ARRAY_COUNT(buf)];

        program.eval(SK_ARRAY_COUNT(buf), buf, dst);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(dst); i++) {
            REPORTER_ASSERT(r, dst[i] == want[i]);
        }
    });
}
590
DEF_TEST(SkVM_min, r) {
    // Lane-wise float minimum of two varying inputs.
    skvm::Builder b;
    {
        skvm::Ptr src1 = b.varying<float>();
        skvm::Ptr src2 = b.varying<float>();
        skvm::Ptr dst  = b.varying<float>();

        b.storeF(dst, b.min(b.loadF(src1), b.loadF(src2)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float s1[]   = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float s2[]   = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        float want[] = { 0.0f, 1.0f, 3.0f, -1.0f, -2.0f};
        float d[SK_ARRAY_COUNT(s1)];
        program.eval(SK_ARRAY_COUNT(d), s1, s2, d);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(d); i++) {
            REPORTER_ASSERT(r, d[i] == want[i]);
        }
    });
}
612
DEF_TEST(SkVM_max, r) {
    // Lane-wise float maximum of two varying inputs.
    skvm::Builder b;
    {
        skvm::Ptr src1 = b.varying<float>();
        skvm::Ptr src2 = b.varying<float>();
        skvm::Ptr dst  = b.varying<float>();

        b.storeF(dst, b.max(b.loadF(src1), b.loadF(src2)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        float s1[]   = { 0.0f, 1.0f, 4.0f, -1.0f, -1.0f};
        float s2[]   = { 0.0f, 2.0f, 3.0f,  1.0f, -2.0f};
        float want[] = { 0.0f, 2.0f, 4.0f,  1.0f, -1.0f};
        float d[SK_ARRAY_COUNT(s1)];
        program.eval(SK_ARRAY_COUNT(d), s1, s2, d);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(d); i++) {
            REPORTER_ASSERT(r, d[i] == want[i]);
        }
    });
}
634
DEF_TEST(SkVM_hoist, r) {
    // This program uses enough constants that it will fail to JIT if we hoist them.
    // The JIT will try again without hoisting, and that'll just need 2 registers.
    skvm::Builder b;
    {
        skvm::Ptr arg = b.varying<int>();
        skvm::I32 x = b.load32(arg);
        for (int i = 0; i < 32; i++) {
            x = b.add(x, b.splat(i));
        }
        b.store32(arg, x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int x = 4;
        program.eval(1, &x);
        // x += 0 + 1 + 2 + 3 + ... + 30 + 31
        // x += 496
        REPORTER_ASSERT(r, x == 500);
    });
}
656
DEF_TEST(SkVM_select, r) {
    // buf[i] = (buf[i] > 4) ? buf[i] : 42
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();

        skvm::I32 x = b.load32(buf);

        x = b.select( b.gt(x, b.splat(4)), x, b.splat(42) );

        b.store32(buf, x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = { 0,1,2,3,4,5,6,7,8 };
        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(buf); i++) {
            REPORTER_ASSERT(r, buf[i] == (i > 4 ? i : 42));
        }
    });
}
677
DEF_TEST(SkVM_swap, r) {
    skvm::Builder b;
    {
        // This program is the equivalent of
        //     x = *X
        //     y = *Y
        //     *X = y
        //     *Y = x
        // One rescheduling of the program based only on data flow of Op arguments is
        //     x = *X
        //     *Y = x
        //     y = *Y
        //     *X = y
        // but this reordering does not produce the same results and is invalid.
        skvm::Ptr X = b.varying<int>(),
                  Y = b.varying<int>();

        skvm::I32 x = b.load32(X),
                  y = b.load32(Y);

        b.store32(X, y);
        b.store32(Y, x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int b1[] = { 0,1,2,3 };
        int b2[] = { 4,5,6,7 };
        program.eval(SK_ARRAY_COUNT(b1), b1, b2);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(b1); i++) {
            REPORTER_ASSERT(r, b1[i] == 4 + i);
            REPORTER_ASSERT(r, b2[i] == i);
        }
    });
}
712
DEF_TEST(SkVM_NewOps, r) {
    // Exercise a somewhat arbitrary set of new ops.
    skvm::Builder b;
    {
        skvm::Ptr buf      = b.varying<int16_t>();
        skvm::UPtr uniforms = b.uniform();

        skvm::I32 x = b.load16(buf);

        // Uniform struct layout: img pointer first, then four 32-bit ints.
        const size_t kPtr = sizeof(const int*);

        x = b.add(x, b.uniform32(uniforms, kPtr+0));
        x = b.mul(x, b.uniform32(uniforms, kPtr+4));
        x = b.sub(x, b.uniform32(uniforms, kPtr+8));

        // Clamp x to [0, limit] before using it as a gather index.
        skvm::I32 limit = b.uniform32(uniforms, kPtr+12);
        x = b.select(b.lt(x, b.splat(0)), b.splat(0), x);
        x = b.select(b.gt(x, limit     ), limit     , x);

        x = b.gather8(uniforms,0, x);

        b.store16(buf, x);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        const int N = 31;
        int16_t buf[N];
        for (int i = 0; i < N; i++) {
            buf[i] = i;
        }

        const int M = 16;
        uint8_t img[M];
        for (int i = 0; i < M; i++) {
            img[i] = i*i;
        }

        struct {
            const uint8_t* img;
            int add   =  5;
            int mul   =  3;
            int sub   = 18;
            int limit = M-1;
        } uniforms{img};

        program.eval(N, buf, &uniforms);

        for (int i = 0; i < N; i++) {
            // Our first math calculates x = (i+5)*3 - 18 a.k.a 3*(i-1).
            int x = 3*(i-1);

            // Then that's pinned to the limits of img.
            if (i < 2) { x =  0; }  // Notice i == 1 hits x == 0 exactly...
            if (i > 5) { x = 15; }  // ...and i == 6 hits x == 15 exactly
            REPORTER_ASSERT(r, buf[i] == img[x]);
        }
    });
}
771
DEF_TEST(SKVM_array32, r) {
    // Uniform arrays are read through a pointer stored in the uniform buffer,
    // so mutating the backing array between evals must be visible to the program.
    skvm::Builder b;
    skvm::Uniforms uniforms(b.uniform(), 0);
    // Take up the first slot, so other uniforms are not at 0 offset.
    uniforms.push(0);
    int i[] = {3, 7};
    skvm::Uniform array = uniforms.pushArray(i);
    float f[] = {5, 9};
    skvm::Uniform arrayF = uniforms.pushArrayF(f);
    {
        skvm::Ptr buf0 = b.varying<int32_t>(),
                  buf1 = b.varying<int32_t>(),
                  buf2 = b.varying<int32_t>();

        skvm::I32 j = b.array32(array, 0);
        b.store32(buf0, j);
        skvm::I32 k = b.array32(array, 1);
        b.store32(buf1, k);

        skvm::F32 x = b.arrayF(arrayF, 0);
        skvm::F32 y = b.arrayF(arrayF, 1);
        b.store32(buf2, b.trunc(b.add(x, y)));
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        const int K = 10;
        int32_t buf0[K],
                buf1[K],
                buf2[K];

        // reset i[0]/f[1] for the two runs (JIT and interpreter) of this lambda.
        i[0] = 3;
        f[1] = 9;
        program.eval(K, uniforms.buf.data(), buf0, buf1, buf2);
        for (auto v : buf0) {
            REPORTER_ASSERT(r, v == 3);
        }
        for (auto v : buf1) {
            REPORTER_ASSERT(r, v == 7);
        }
        for (auto v : buf2) {
            REPORTER_ASSERT(r, v == 14);
        }
        // Mutate the arrays in place; a second eval should see the new values.
        i[0] = 4;
        f[1] = 10;
        program.eval(K, uniforms.buf.data(), buf0, buf1, buf2);
        for (auto v : buf0) {
            REPORTER_ASSERT(r, v == 4);
        }
        for (auto v : buf1) {
            REPORTER_ASSERT(r, v == 7);
        }
        for (auto v : buf2) {
            REPORTER_ASSERT(r, v == 15);
        }
    });
}
832
DEF_TEST(SkVM_sqrt, r) {
    skvm::Builder b;
    auto buf = b.varying<int>();   // Reinterpreted as float via loadF/storeF.
    b.storeF(buf, b.sqrt(b.loadF(buf)));

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        constexpr int K = 17;
        float buf[K];
        for (int i = 0; i < K; i++) {
            buf[i] = (float)(i*i);   // Perfect squares, so sqrt is exact.
        }

        // x^2 -> x
        program.eval(K, buf);

        for (int i = 0; i < K; i++) {
            REPORTER_ASSERT(r, buf[i] == (float)i);
        }
    });
}
853
DEF_TEST(SkVM_MSAN, r) {
    // This little memset32() program should be able to JIT, but if we run that
    // JIT code in an MSAN build, it won't see the writes initialize buf.  So
    // this tests that we're using the interpreter instead.
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        constexpr int K = 17;
        int buf[K];                 // Intentionally uninitialized.
        program.eval(K, buf);
        sk_msan_assert_initialized(buf, buf+K);
        for (int x : buf) {
            REPORTER_ASSERT(r, x == 42);
        }
    });
}
871
DEF_TEST(SkVM_assert, r) {
    // assert_true should be a no-op while every input satisfies the condition.
    skvm::Builder b;
    b.assert_true(b.lt(b.load32(b.varying<int>()),
                       b.splat(42)));

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        int buf[] = { 0,1,2,3,4,5,6,7,8,9 };
        program.eval(SK_ARRAY_COUNT(buf), buf);
    });
}
882
DEF_TEST(SkVM_trace_line, r) {
    skvm::Builder b;
    b.trace_line(b.splat(0xFFFFFFFF), 123);

    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
        // The trace_line instruction has no behavior yet; just check it runs.
        program.eval(1);
    });
}
892
DEF_TEST(SkVM_premul, reporter) {
    // Test that premul is short-circuited when alpha is known opaque.
    {
        skvm::Builder p;
        auto rptr = p.varying<int>(),
             aptr = p.varying<int>();

        skvm::F32 r = p.loadF(rptr),
                  g = p.splat(0.0f),
                  b = p.splat(0.0f),
                  a = p.loadF(aptr);

        p.premul(&r, &g, &b, a);
        p.storeF(rptr, r);

        // load red, load alpha, red *= alpha, store red
        REPORTER_ASSERT(reporter, p.done().instructions().size() == 4);
    }

    {
        skvm::Builder p;
        auto rptr = p.varying<int>();

        skvm::F32 r = p.loadF(rptr),
                  g = p.splat(0.0f),
                  b = p.splat(0.0f),
                  a = p.splat(1.0f);   // Known-opaque alpha: premul is a no-op.

        p.premul(&r, &g, &b, a);
        p.storeF(rptr, r);

        // load red, store red
        REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
    }

    // Same deal for unpremul.
    {
        skvm::Builder p;
        auto rptr = p.varying<int>(),
             aptr = p.varying<int>();

        skvm::F32 r = p.loadF(rptr),
                  g = p.splat(0.0f),
                  b = p.splat(0.0f),
                  a = p.loadF(aptr);

        p.unpremul(&r, &g, &b, a);
        p.storeF(rptr, r);

        // load red, load alpha, a bunch of unpremul instructions, store red
        REPORTER_ASSERT(reporter, p.done().instructions().size() >= 4);
    }

    {
        skvm::Builder p;
        auto rptr = p.varying<int>();

        skvm::F32 r = p.loadF(rptr),
                  g = p.splat(0.0f),
                  b = p.splat(0.0f),
                  a = p.splat(1.0f);   // Known-opaque alpha: unpremul is a no-op.

        p.unpremul(&r, &g, &b, a);
        p.storeF(rptr, r);

        // load red, store red
        REPORTER_ASSERT(reporter, p.done().instructions().size() == 2);
    }
}
962
963 template <typename Fn>
test_asm(skiatest::Reporter * r,Fn && fn,std::initializer_list<uint8_t> expected)964 static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
965 uint8_t buf[4096];
966 skvm::Assembler a{buf};
967 fn(a);
968
969 REPORTER_ASSERT(r, a.size() == expected.size());
970
971 auto got = (const uint8_t*)buf,
972 want = expected.begin();
973 for (int i = 0; i < (int)std::min(a.size(), expected.size()); i++) {
974 REPORTER_ASSERT(r, got[i] == want[i],
975 "byte %d was %02x, want %02x", i, got[i], want[i]);
976 }
977 }
978
DEF_TEST(SkVM_Assembler,r)979 DEF_TEST(SkVM_Assembler, r) {
980 // Easiest way to generate test cases is
981 //
982 // echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
983 //
984 // The -x86-asm-syntax=intel bit is optional, controlling the
985 // input syntax only; the output will always be AT&T op x,y,dst style.
986 // Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
987 // that a bit easier to use here, despite maybe favoring AT&T overall.
988
989 using A = skvm::Assembler;
990 // Our exit strategy from AVX code.
991 test_asm(r, [&](A& a) {
992 a.int3();
993 a.vzeroupper();
994 a.ret();
995 },{
996 0xcc,
997 0xc5, 0xf8, 0x77,
998 0xc3,
999 });
1000
1001 // Align should pad with zero
1002 test_asm(r, [&](A& a) {
1003 a.ret();
1004 a.align(4);
1005 },{
1006 0xc3,
1007 0x00, 0x00, 0x00,
1008 });
1009
1010 test_asm(r, [&](A& a) {
1011 a.add(A::rax, 8); // Always good to test rax.
1012 a.sub(A::rax, 32);
1013
1014 a.add(A::rdi, 12); // Last 0x48 REX
1015 a.sub(A::rdi, 8);
1016
1017 a.add(A::r8 , 7); // First 0x49 REX
1018 a.sub(A::r8 , 4);
1019
1020 a.add(A::rsi, 128); // Requires 4 byte immediate.
1021 a.sub(A::r8 , 1000000);
1022
1023 a.add(A::Mem{A::rsi}, 7); // addq $7, (%rsi)
1024 a.add(A::Mem{A::rsi, 12}, 7); // addq $7, 12(%rsi)
1025 a.add(A::Mem{A::rsp, 12}, 7); // addq $7, 12(%rsp)
1026 a.add(A::Mem{A::r12, 12}, 7); // addq $7, 12(%r12)
1027 a.add(A::Mem{A::rsp, 12, A::rax, A::FOUR}, 7); // addq $7, 12(%rsp,%rax,4)
1028 a.add(A::Mem{A::r12, 12, A::rax, A::FOUR}, 7); // addq $7, 12(%r12,%rax,4)
1029 a.add(A::Mem{A::rax, 12, A::r12, A::FOUR}, 7); // addq $7, 12(%rax,%r12,4)
1030 a.add(A::Mem{A::r11, 12, A::r8 , A::TWO }, 7); // addq $7, 12(%r11,%r8,2)
1031 a.add(A::Mem{A::r11, 12, A::rax} , 7); // addq $7, 12(%r11,%rax)
1032 a.add(A::Mem{A::rax, 12, A::r11} , 7); // addq $7, 12(%rax,%r11)
1033
1034 a.sub(A::Mem{A::rax, 12, A::r11} , 7); // subq $7, 12(%rax,%r11)
1035
1036 a.add( A::rax , A::rcx); // addq %rcx, %rax
1037 a.add(A::Mem{A::rax} , A::rcx); // addq %rcx, (%rax)
1038 a.add(A::Mem{A::rax, 12}, A::rcx); // addq %rcx, 12(%rax)
1039 a.add(A::rcx, A::Mem{A::rax, 12}); // addq 12(%rax), %rcx
1040
1041 a.sub(A::rcx, A::Mem{A::rax, 12}); // subq 12(%rax), %rcx
1042 },{
1043 0x48, 0x83, 0b11'000'000, 0x08,
1044 0x48, 0x83, 0b11'101'000, 0x20,
1045
1046 0x48, 0x83, 0b11'000'111, 0x0c,
1047 0x48, 0x83, 0b11'101'111, 0x08,
1048
1049 0x49, 0x83, 0b11'000'000, 0x07,
1050 0x49, 0x83, 0b11'101'000, 0x04,
1051
1052 0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
1053 0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
1054
1055 0x48,0x83,0x06,0x07,
1056 0x48,0x83,0x46,0x0c,0x07,
1057 0x48,0x83,0x44,0x24,0x0c,0x07,
1058 0x49,0x83,0x44,0x24,0x0c,0x07,
1059 0x48,0x83,0x44,0x84,0x0c,0x07,
1060 0x49,0x83,0x44,0x84,0x0c,0x07,
1061 0x4a,0x83,0x44,0xa0,0x0c,0x07,
1062 0x4b,0x83,0x44,0x43,0x0c,0x07,
1063 0x49,0x83,0x44,0x03,0x0c,0x07,
1064 0x4a,0x83,0x44,0x18,0x0c,0x07,
1065
1066 0x4a,0x83,0x6c,0x18,0x0c,0x07,
1067
1068 0x48,0x01,0xc8,
1069 0x48,0x01,0x08,
1070 0x48,0x01,0x48,0x0c,
1071 0x48,0x03,0x48,0x0c,
1072 0x48,0x2b,0x48,0x0c,
1073 });
1074
1075
1076 test_asm(r, [&](A& a) {
1077 a.vpaddd (A::ymm0, A::ymm1, A::ymm2); // Low registers and 0x0f map -> 2-byte VEX.
1078 a.vpaddd (A::ymm8, A::ymm1, A::ymm2); // A high dst register is ok -> 2-byte VEX.
1079 a.vpaddd (A::ymm0, A::ymm8, A::ymm2); // A high first argument register -> 2-byte VEX.
1080 a.vpaddd (A::ymm0, A::ymm1, A::ymm8); // A high second argument -> 3-byte VEX.
1081 a.vpmulld(A::ymm0, A::ymm1, A::ymm2); // Using non-0x0f map instruction -> 3-byte VEX.
1082 a.vpsubd (A::ymm0, A::ymm1, A::ymm2); // Test vpsubd to ensure argument order is right.
1083 },{
1084 /* VEX */ /*op*/ /*modRM*/
1085 0xc5, 0xf5, 0xfe, 0xc2,
1086 0xc5, 0x75, 0xfe, 0xc2,
1087 0xc5, 0xbd, 0xfe, 0xc2,
1088 0xc4, 0xc1, 0x75, 0xfe, 0xc0,
1089 0xc4, 0xe2, 0x75, 0x40, 0xc2,
1090 0xc5, 0xf5, 0xfa, 0xc2,
1091 });
1092
1093 test_asm(r, [&](A& a) {
1094 a.vpaddw (A::ymm4, A::ymm3, A::ymm2);
1095 a.vpavgw (A::ymm4, A::ymm3, A::ymm2);
1096 a.vpcmpeqw (A::ymm4, A::ymm3, A::ymm2);
1097 a.vpcmpgtw (A::ymm4, A::ymm3, A::ymm2);
1098
1099 a.vpminsw (A::ymm4, A::ymm3, A::ymm2);
1100 a.vpmaxsw (A::ymm4, A::ymm3, A::ymm2);
1101 a.vpminuw (A::ymm4, A::ymm3, A::ymm2);
1102 a.vpmaxuw (A::ymm4, A::ymm3, A::ymm2);
1103
1104 a.vpmulhrsw(A::ymm4, A::ymm3, A::ymm2);
1105 a.vpabsw (A::ymm4, A::ymm3);
1106 a.vpsllw (A::ymm4, A::ymm3, 12);
1107 a.vpsraw (A::ymm4, A::ymm3, 12);
1108 },{
1109 0xc5, 0xe5, 0xfd, 0xe2,
1110 0xc5, 0xe5, 0xe3, 0xe2,
1111 0xc5, 0xe5, 0x75, 0xe2,
1112 0xc5, 0xe5, 0x65, 0xe2,
1113
1114 0xc5, 0xe5, 0xea, 0xe2,
1115 0xc5, 0xe5, 0xee, 0xe2,
1116 0xc4,0xe2,0x65, 0x3a, 0xe2,
1117 0xc4,0xe2,0x65, 0x3e, 0xe2,
1118
1119 0xc4,0xe2,0x65, 0x0b, 0xe2,
1120 0xc4,0xe2,0x7d, 0x1d, 0xe3,
1121 0xc5,0xdd,0x71, 0xf3, 0x0c,
1122 0xc5,0xdd,0x71, 0xe3, 0x0c,
1123 });
1124
1125 test_asm(r, [&](A& a) {
1126 A::Label l;
1127 a.vcmpeqps (A::ymm0, A::ymm1, &l); // vcmpeqps 0x1c(%rip), %ymm1, %ymm0
1128 a.vpcmpeqd (A::ymm0, A::ymm1, A::ymm2);
1129 a.vpcmpgtd (A::ymm0, A::ymm1, A::ymm2);
1130 a.vcmpeqps (A::ymm0, A::ymm1, A::ymm2);
1131 a.vcmpltps (A::ymm0, A::ymm1, A::ymm2);
1132 a.vcmpleps (A::ymm0, A::ymm1, A::ymm2);
1133 a.vcmpneqps(A::ymm0, A::ymm1, A::ymm2);
1134 a.label(&l); // 28 bytes after the vcmpeqps that uses it.
1135 },{
1136 0xc5,0xf4,0xc2,0x05,0x1c,0x00,0x00,0x00,0x00,
1137 0xc5,0xf5,0x76,0xc2,
1138 0xc5,0xf5,0x66,0xc2,
1139 0xc5,0xf4,0xc2,0xc2,0x00,
1140 0xc5,0xf4,0xc2,0xc2,0x01,
1141 0xc5,0xf4,0xc2,0xc2,0x02,
1142 0xc5,0xf4,0xc2,0xc2,0x04,
1143 });
1144
1145 test_asm(r, [&](A& a) {
1146 a.vminps(A::ymm0, A::ymm1, A::ymm2);
1147 a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
1148 },{
1149 0xc5,0xf4,0x5d,0xc2,
1150 0xc5,0xf4,0x5f,0xc2,
1151 });
1152
1153 test_asm(r, [&](A& a) {
1154 a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
1155 },{
1156 0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
1157 });
1158
1159 test_asm(r, [&](A& a) {
1160 a.vpsrld(A::ymm15, A::ymm2, 8);
1161 a.vpsrld(A::ymm0 , A::ymm8, 5);
1162 },{
1163 0xc5, 0x85, 0x72,0xd2, 0x08,
1164 0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
1165 });
1166
1167 test_asm(r, [&](A& a) {
1168 A::Label l;
1169 a.vpermps(A::ymm1, A::ymm2, A::Mem{A::rdi, 32});
1170 a.vperm2f128(A::ymm1, A::ymm2, &l, 0x20);
1171 a.vpermq(A::ymm1, A::ymm2, 5);
1172 a.label(&l); // 6 bytes after vperm2f128
1173 },{
1174 0xc4,0xe2,0x6d,0x16,0x4f,0x20,
1175 0xc4,0xe3,0x6d,0x06,0x0d,0x06,0x00,0x00,0x00,0x20,
1176 0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
1177 });
1178
1179 test_asm(r, [&](A& a) {
1180 a.vpunpckldq(A::ymm1, A::ymm2, A::Mem{A::rdi});
1181 a.vpunpckhdq(A::ymm1, A::ymm2, A::ymm3);
1182 },{
1183 0xc5,0xed,0x62,0x0f,
1184 0xc5,0xed,0x6a,0xcb,
1185 });
1186
1187 test_asm(r, [&](A& a) {
1188 a.vroundps(A::ymm1, A::ymm2, A::NEAREST);
1189 a.vroundps(A::ymm1, A::ymm2, A::FLOOR);
1190 a.vroundps(A::ymm1, A::ymm2, A::CEIL);
1191 a.vroundps(A::ymm1, A::ymm2, A::TRUNC);
1192 },{
1193 0xc4,0xe3,0x7d,0x08,0xca,0x00,
1194 0xc4,0xe3,0x7d,0x08,0xca,0x01,
1195 0xc4,0xe3,0x7d,0x08,0xca,0x02,
1196 0xc4,0xe3,0x7d,0x08,0xca,0x03,
1197 });
1198
1199 test_asm(r, [&](A& a) {
1200 A::Label l;
1201 a.label(&l);
1202 a.byte(1);
1203 a.byte(2);
1204 a.byte(3);
1205 a.byte(4);
1206
1207 a.vbroadcastss(A::ymm0 , &l);
1208 a.vbroadcastss(A::ymm1 , &l);
1209 a.vbroadcastss(A::ymm8 , &l);
1210 a.vbroadcastss(A::ymm15, &l);
1211
1212 a.vpshufb(A::ymm4, A::ymm3, &l);
1213 a.vpaddd (A::ymm4, A::ymm3, &l);
1214 a.vpsubd (A::ymm4, A::ymm3, &l);
1215
1216 a.vptest(A::ymm4, &l);
1217
1218 a.vmulps (A::ymm4, A::ymm3, &l);
1219 },{
1220 0x01, 0x02, 0x03, 0x4,
1221
1222 /* VEX */ /*op*/ /* ModRM */ /* offset */
1223 0xc4, 0xe2, 0x7d, 0x18, 0b00'000'101, 0xf3,0xff,0xff,0xff, // 0xfffffff3 == -13
1224 0xc4, 0xe2, 0x7d, 0x18, 0b00'001'101, 0xea,0xff,0xff,0xff, // 0xffffffea == -22
1225 0xc4, 0x62, 0x7d, 0x18, 0b00'000'101, 0xe1,0xff,0xff,0xff, // 0xffffffe1 == -31
1226 0xc4, 0x62, 0x7d, 0x18, 0b00'111'101, 0xd8,0xff,0xff,0xff, // 0xffffffd8 == -40
1227
1228 0xc4, 0xe2, 0x65, 0x00, 0b00'100'101, 0xcf,0xff,0xff,0xff, // 0xffffffcf == -49
1229
1230 0xc5, 0xe5, 0xfe, 0b00'100'101, 0xc7,0xff,0xff,0xff, // 0xffffffc7 == -57
1231 0xc5, 0xe5, 0xfa, 0b00'100'101, 0xbf,0xff,0xff,0xff, // 0xffffffbf == -65
1232
1233 0xc4, 0xe2, 0x7d, 0x17, 0b00'100'101, 0xb6,0xff,0xff,0xff, // 0xffffffb6 == -74
1234
 1235         0xc5, 0xe4, 0x59, 0b00'100'101,   0xae,0xff,0xff,0xff,  // 0xffffffae == -82
1236 });
1237
1238 test_asm(r, [&](A& a) {
1239 a.vbroadcastss(A::ymm0, A::Mem{A::rdi, 0});
1240 a.vbroadcastss(A::ymm13, A::Mem{A::r14, 7});
1241 a.vbroadcastss(A::ymm8, A::Mem{A::rdx, -12});
1242 a.vbroadcastss(A::ymm8, A::Mem{A::rdx, 400});
1243
1244 a.vbroadcastss(A::ymm8, A::xmm0);
1245 a.vbroadcastss(A::ymm0, A::xmm13);
1246 },{
1247 /* VEX */ /*op*/ /*ModRM*/ /*offset*/
1248 0xc4,0xe2,0x7d, 0x18, 0b00'000'111,
1249 0xc4,0x42,0x7d, 0x18, 0b01'101'110, 0x07,
1250 0xc4,0x62,0x7d, 0x18, 0b01'000'010, 0xf4,
1251 0xc4,0x62,0x7d, 0x18, 0b10'000'010, 0x90,0x01,0x00,0x00,
1252
1253 0xc4,0x62,0x7d, 0x18, 0b11'000'000,
1254 0xc4,0xc2,0x7d, 0x18, 0b11'000'101,
1255 });
1256
1257 test_asm(r, [&](A& a) {
1258 A::Label l;
1259 a.label(&l);
1260 a.jne(&l);
1261 a.jne(&l);
1262 a.je (&l);
1263 a.jmp(&l);
1264 a.jl (&l);
1265 a.jc (&l);
1266
1267 a.cmp(A::rdx, 1);
1268 a.cmp(A::rax, 12);
1269 a.cmp(A::r14, 2000000000);
1270 },{
1271 0x0f,0x85, 0xfa,0xff,0xff,0xff, // near jne -6 bytes
1272 0x0f,0x85, 0xf4,0xff,0xff,0xff, // near jne -12 bytes
1273 0x0f,0x84, 0xee,0xff,0xff,0xff, // near je -18 bytes
1274 0xe9, 0xe9,0xff,0xff,0xff, // near jmp -23 bytes
1275 0x0f,0x8c, 0xe3,0xff,0xff,0xff, // near jl -29 bytes
1276 0x0f,0x82, 0xdd,0xff,0xff,0xff, // near jc -35 bytes
1277
1278 0x48,0x83,0xfa,0x01,
1279 0x48,0x83,0xf8,0x0c,
1280 0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
1281 });
1282
1283 test_asm(r, [&](A& a) {
1284 a.vmovups(A::ymm5, A::Mem{A::rsi});
1285 a.vmovups(A::Mem{A::rsi}, A::ymm5);
1286
1287 a.vmovups(A::xmm5, A::Mem{A::rsi});
1288 a.vmovups(A::Mem{A::rsi}, A::xmm5);
1289
1290 a.vpmovzxwd(A::ymm4, A::Mem{A::rsi});
1291 a.vpmovzxbd(A::ymm4, A::Mem{A::rsi});
1292
1293 a.vmovq(A::Mem{A::rdx}, A::xmm15);
1294 },{
1295 /* VEX */ /*Op*/ /* ModRM */
1296 0xc5, 0xfc, 0x10, 0b00'101'110,
1297 0xc5, 0xfc, 0x11, 0b00'101'110,
1298
1299 0xc5, 0xf8, 0x10, 0b00'101'110,
1300 0xc5, 0xf8, 0x11, 0b00'101'110,
1301
1302 0xc4,0xe2,0x7d, 0x33, 0b00'100'110,
1303 0xc4,0xe2,0x7d, 0x31, 0b00'100'110,
1304
1305 0xc5, 0x79, 0xd6, 0b00'111'010,
1306 });
1307
1308 test_asm(r, [&](A& a) {
1309 a.vmovups(A::ymm5, A::Mem{A::rsp, 0});
1310 a.vmovups(A::ymm5, A::Mem{A::rsp, 64});
1311 a.vmovups(A::ymm5, A::Mem{A::rsp,128});
1312
1313 a.vmovups(A::Mem{A::rsp, 0}, A::ymm5);
1314 a.vmovups(A::Mem{A::rsp, 64}, A::ymm5);
1315 a.vmovups(A::Mem{A::rsp,128}, A::ymm5);
1316 },{
1317 0xc5,0xfc,0x10,0x2c,0x24,
1318 0xc5,0xfc,0x10,0x6c,0x24,0x40,
1319 0xc5,0xfc,0x10,0xac,0x24,0x80,0x00,0x00,0x00,
1320
1321 0xc5,0xfc,0x11,0x2c,0x24,
1322 0xc5,0xfc,0x11,0x6c,0x24,0x40,
1323 0xc5,0xfc,0x11,0xac,0x24,0x80,0x00,0x00,0x00,
1324 });
1325
1326 test_asm(r, [&](A& a) {
1327 a.movzbq(A::rax, A::Mem{A::rsi}); // Low registers for src and dst.
1328 a.movzbq(A::rax, A::Mem{A::r8,}); // High src register.
1329 a.movzbq(A::r8 , A::Mem{A::rsi}); // High dst register.
1330 a.movzbq(A::r8, A::Mem{A::rsi, 12});
1331 a.movzbq(A::r8, A::Mem{A::rsi, 400});
1332
1333 a.movzwq(A::rax, A::Mem{A::rsi}); // Low registers for src and dst.
1334 a.movzwq(A::rax, A::Mem{A::r8,}); // High src register.
1335 a.movzwq(A::r8 , A::Mem{A::rsi}); // High dst register.
1336 a.movzwq(A::r8, A::Mem{A::rsi, 12});
1337 a.movzwq(A::r8, A::Mem{A::rsi, 400});
1338
1339 a.vmovd(A::Mem{A::rax}, A::xmm0);
1340 a.vmovd(A::Mem{A::rax}, A::xmm8);
1341 a.vmovd(A::Mem{A::r8 }, A::xmm0);
1342
1343 a.vmovd(A::xmm0, A::Mem{A::rax});
1344 a.vmovd(A::xmm8, A::Mem{A::rax});
1345 a.vmovd(A::xmm0, A::Mem{A::r8 });
1346
1347 a.vmovd(A::xmm0 , A::Mem{A::rax, 0, A::rcx, A::FOUR});
1348 a.vmovd(A::xmm15, A::Mem{A::rax, 0, A::r8, A::TWO });
1349 a.vmovd(A::xmm0 , A::Mem{A::r8 , 0, A::rcx});
1350
1351 a.vmovd(A::rax, A::xmm0);
1352 a.vmovd(A::rax, A::xmm8);
1353 a.vmovd(A::r8 , A::xmm0);
1354
1355 a.vmovd(A::xmm0, A::rax);
1356 a.vmovd(A::xmm8, A::rax);
1357 a.vmovd(A::xmm0, A::r8 );
1358
1359 a.movb(A::Mem{A::rdx}, A::rax);
1360 a.movb(A::Mem{A::rdx}, A::r8 );
1361 a.movb(A::Mem{A::r8 }, A::rax);
1362
1363 a.movb(A::rdx, A::Mem{A::rax});
1364 a.movb(A::rdx, A::Mem{A::r8 });
1365 a.movb(A::r8 , A::Mem{A::rax});
1366
1367 a.movb(A::rdx, 12);
1368 a.movb(A::rax, 4);
1369 a.movb(A::r8 , -1);
1370
1371 a.movb(A::Mem{A::rdx}, 12);
1372 a.movb(A::Mem{A::rax}, 4);
1373 a.movb(A::Mem{A::r8 }, -1);
1374 },{
1375 0x48,0x0f,0xb6,0x06, // movzbq (%rsi), %rax
1376 0x49,0x0f,0xb6,0x00,
1377 0x4c,0x0f,0xb6,0x06,
1378 0x4c,0x0f,0xb6,0x46, 12,
1379 0x4c,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,
1380
1381 0x48,0x0f,0xb7,0x06, // movzwq (%rsi), %rax
1382 0x49,0x0f,0xb7,0x00,
1383 0x4c,0x0f,0xb7,0x06,
1384 0x4c,0x0f,0xb7,0x46, 12,
1385 0x4c,0x0f,0xb7,0x86, 0x90,0x01,0x00,0x00,
1386
1387 0xc5,0xf9,0x7e,0x00,
1388 0xc5,0x79,0x7e,0x00,
1389 0xc4,0xc1,0x79,0x7e,0x00,
1390
1391 0xc5,0xf9,0x6e,0x00,
1392 0xc5,0x79,0x6e,0x00,
1393 0xc4,0xc1,0x79,0x6e,0x00,
1394
1395 0xc5,0xf9,0x6e,0x04,0x88,
1396 0xc4,0x21,0x79,0x6e,0x3c,0x40,
1397 0xc4,0xc1,0x79,0x6e,0x04,0x08,
1398
1399 0xc5,0xf9,0x7e,0xc0,
1400 0xc5,0x79,0x7e,0xc0,
1401 0xc4,0xc1,0x79,0x7e,0xc0,
1402
1403 0xc5,0xf9,0x6e,0xc0,
1404 0xc5,0x79,0x6e,0xc0,
1405 0xc4,0xc1,0x79,0x6e,0xc0,
1406
1407 0x48 ,0x88, 0x02,
1408 0x4c, 0x88, 0x02,
1409 0x49, 0x88, 0x00,
1410
1411 0x48 ,0x8a, 0x10,
1412 0x49, 0x8a, 0x10,
1413 0x4c, 0x8a, 0x00,
1414
1415 0x48, 0xc6, 0xc2, 0x0c,
1416 0x48, 0xc6, 0xc0, 0x04,
1417 0x49, 0xc6, 0xc0, 0xff,
1418
1419 0x48, 0xc6, 0x02, 0x0c,
1420 0x48, 0xc6, 0x00, 0x04,
1421 0x49, 0xc6, 0x00, 0xff,
1422 });
1423
1424 test_asm(r, [&](A& a) {
1425 a.vpinsrd(A::xmm1, A::xmm8, A::Mem{A::rsi}, 1); // vpinsrd $1, (%rsi), %xmm8, %xmm1
1426 a.vpinsrd(A::xmm8, A::xmm1, A::Mem{A::r8 }, 3); // vpinsrd $3, (%r8), %xmm1, %xmm8;
1427
1428 a.vpinsrw(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrw $4, (%rsi), %xmm8, %xmm1
 1429         a.vpinsrw(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12);  // vpinsrw $12, (%r8), %xmm1, %xmm8
1430
1431 a.vpinsrb(A::xmm1, A::xmm8, A::Mem{A::rsi}, 4); // vpinsrb $4, (%rsi), %xmm8, %xmm1
1432 a.vpinsrb(A::xmm8, A::xmm1, A::Mem{A::r8 }, 12); // vpinsrb $12, (%r8), %xmm1, %xmm8
1433
1434 a.vextracti128(A::xmm1, A::ymm8, 1); // vextracti128 $1, %ymm8, %xmm1
1435 a.vextracti128(A::xmm8, A::ymm1, 0); // vextracti128 $0, %ymm1, %xmm8
1436
1437 a.vpextrd(A::Mem{A::rsi}, A::xmm8, 3); // vpextrd $3, %xmm8, (%rsi)
1438 a.vpextrd(A::Mem{A::r8 }, A::xmm1, 2); // vpextrd $2, %xmm1, (%r8)
1439
1440 a.vpextrw(A::Mem{A::rsi}, A::xmm8, 7);
1441 a.vpextrw(A::Mem{A::r8 }, A::xmm1, 15);
1442
1443 a.vpextrb(A::Mem{A::rsi}, A::xmm8, 7);
1444 a.vpextrb(A::Mem{A::r8 }, A::xmm1, 15);
1445 },{
1446 0xc4,0xe3,0x39, 0x22, 0x0e, 1,
1447 0xc4,0x43,0x71, 0x22, 0x00, 3,
1448
1449 0xc5,0xb9, 0xc4, 0x0e, 4,
1450 0xc4,0x41,0x71, 0xc4, 0x00, 12,
1451
1452 0xc4,0xe3,0x39, 0x20, 0x0e, 4,
1453 0xc4,0x43,0x71, 0x20, 0x00, 12,
1454
1455 0xc4,0x63,0x7d,0x39,0xc1, 1,
1456 0xc4,0xc3,0x7d,0x39,0xc8, 0,
1457
1458 0xc4,0x63,0x79,0x16,0x06, 3,
1459 0xc4,0xc3,0x79,0x16,0x08, 2,
1460
1461 0xc4,0x63,0x79, 0x15, 0x06, 7,
1462 0xc4,0xc3,0x79, 0x15, 0x08, 15,
1463
1464 0xc4,0x63,0x79, 0x14, 0x06, 7,
1465 0xc4,0xc3,0x79, 0x14, 0x08, 15,
1466 });
1467
1468 test_asm(r, [&](A& a) {
1469 a.vpandn(A::ymm3, A::ymm12, A::ymm2);
1470 },{
1471 0xc5, 0x9d, 0xdf, 0xda,
1472 });
1473
1474 test_asm(r, [&](A& a) {
1475 A::Label l;
1476 a.vmovdqa(A::ymm3, A::ymm2); // vmovdqa %ymm2 , %ymm3
1477
1478 a.vmovdqa(A::ymm3, A::Mem{A::rsi}); // vmovdqa (%rsi) , %ymm3
1479 a.vmovdqa(A::ymm3, A::Mem{A::rsp}); // vmovdqa (%rsp) , %ymm3
1480 a.vmovdqa(A::ymm3, A::Mem{A::r11}); // vmovdqa (%r11) , %ymm3
1481
1482 a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4}); // vmovdqa 4(%rsi) , %ymm3
1483 a.vmovdqa(A::ymm3, A::Mem{A::rsp, 4}); // vmovdqa 4(%rsp) , %ymm3
1484
1485 a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::rax, A::EIGHT}); // vmovdqa 4(%rsi,%rax,8), %ymm3
1486 a.vmovdqa(A::ymm3, A::Mem{A::r11, 4, A::rax, A::TWO }); // vmovdqa 4(%r11,%rax,2), %ymm3
1487 a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::r11, A::FOUR }); // vmovdqa 4(%rsi,%r11,4), %ymm3
1488 a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::r11, A::ONE }); // vmovdqa 4(%rsi,%r11,1), %ymm3
1489 a.vmovdqa(A::ymm3, A::Mem{A::rsi, 4, A::r11}); // vmovdqa 4(%rsi,%r11) , %ymm3
1490
1491 a.vmovdqa(A::ymm3, A::Mem{A::rsi, 64, A::r11}); // vmovdqa 64(%rsi,%r11), %ymm3
1492 a.vmovdqa(A::ymm3, A::Mem{A::rsi, 128, A::r11}); // vmovdqa 128(%rsi,%r11), %ymm3
1493 a.vmovdqa(A::ymm3, &l); // vmovdqa 16(%rip) , %ymm3
1494
1495 a.vcvttps2dq(A::ymm3, A::ymm2);
1496 a.vcvtdq2ps (A::ymm3, A::ymm2);
1497 a.vcvtps2dq (A::ymm3, A::ymm2);
1498 a.vsqrtps (A::ymm3, A::ymm2);
1499 a.label(&l);
1500 },{
1501 0xc5,0xfd,0x6f,0xda,
1502
1503 0xc5,0xfd,0x6f,0x1e,
1504 0xc5,0xfd,0x6f,0x1c,0x24,
1505 0xc4,0xc1,0x7d,0x6f,0x1b,
1506
1507 0xc5,0xfd,0x6f,0x5e,0x04,
1508 0xc5,0xfd,0x6f,0x5c,0x24,0x04,
1509
1510 0xc5,0xfd,0x6f,0x5c,0xc6,0x04,
1511 0xc4,0xc1,0x7d,0x6f,0x5c,0x43,0x04,
1512 0xc4,0xa1,0x7d,0x6f,0x5c,0x9e,0x04,
1513 0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1514 0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x04,
1515
1516 0xc4,0xa1,0x7d,0x6f,0x5c,0x1e,0x40,
1517 0xc4,0xa1,0x7d,0x6f,0x9c,0x1e,0x80,0x00,0x00,0x00,
1518
1519 0xc5,0xfd,0x6f,0x1d,0x10,0x00,0x00,0x00,
1520
1521 0xc5,0xfe,0x5b,0xda,
1522 0xc5,0xfc,0x5b,0xda,
1523 0xc5,0xfd,0x5b,0xda,
1524 0xc5,0xfc,0x51,0xda,
1525 });
1526
1527 test_asm(r, [&](A& a) {
1528 a.vcvtps2ph(A::xmm3, A::ymm2, A::CURRENT);
1529 a.vcvtps2ph(A::Mem{A::rsi, 32, A::rax, A::EIGHT}, A::ymm5, A::CEIL);
1530
1531 a.vcvtph2ps(A::ymm15, A::Mem{A::rdi, 12, A::r9, A::ONE});
1532 a.vcvtph2ps(A::ymm2, A::xmm3);
1533 },{
1534 0xc4,0xe3,0x7d,0x1d,0xd3,0x04,
1535 0xc4,0xe3,0x7d,0x1d,0x6c,0xc6,0x20,0x02,
1536
1537 0xc4,0x22,0x7d,0x13,0x7c,0x0f,0x0c,
1538 0xc4,0xe2,0x7d,0x13,0xd3,
1539 });
1540
1541 test_asm(r, [&](A& a) {
1542 a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
1543 a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::rax, A::ymm1 );
1544 a.vgatherdps(A::ymm10, A::ONE , A::ymm2 , A::rax, A::ymm1 );
1545 a.vgatherdps(A::ymm0 , A::ONE , A::ymm12, A::rax, A::ymm1 );
1546 a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::r9 , A::ymm1 );
1547 a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::rax, A::ymm12);
1548 a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
1549 },{
1550 0xc4,0xe2,0x6d,0x92,0x0c,0x87,
1551 0xc4,0xe2,0x75,0x92,0x04,0x10,
1552 0xc4,0x62,0x75,0x92,0x14,0x10,
1553 0xc4,0xa2,0x75,0x92,0x04,0x20,
1554 0xc4,0xc2,0x75,0x92,0x04,0x11,
1555 0xc4,0xe2,0x1d,0x92,0x04,0x10,
1556 0xc4,0xe2,0x1d,0x92,0x04,0xd0,
1557 });
1558
1559 test_asm(r, [&](A& a) {
1560 a.mov(A::rax, A::Mem{A::rdi, 0});
1561 a.mov(A::rax, A::Mem{A::rdi, 1});
1562 a.mov(A::rax, A::Mem{A::rdi, 512});
1563 a.mov(A::r15, A::Mem{A::r13, 42});
1564 a.mov(A::rax, A::Mem{A::r13, 42});
1565 a.mov(A::r15, A::Mem{A::rax, 42});
1566 a.mov(A::rax, 1);
1567 a.mov(A::rax, A::rcx);
1568 },{
1569 0x48, 0x8b, 0x07,
1570 0x48, 0x8b, 0x47, 0x01,
1571 0x48, 0x8b, 0x87, 0x00,0x02,0x00,0x00,
1572 0x4d, 0x8b, 0x7d, 0x2a,
1573 0x49, 0x8b, 0x45, 0x2a,
1574 0x4c, 0x8b, 0x78, 0x2a,
1575 0x48, 0xc7, 0xc0, 0x01,0x00,0x00,0x00,
1576 0x48, 0x89, 0xc8,
1577 });
1578
1579 // echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
1580
1581 test_asm(r, [&](A& a) {
1582 a.and16b(A::v4, A::v3, A::v1);
1583 a.orr16b(A::v4, A::v3, A::v1);
1584 a.eor16b(A::v4, A::v3, A::v1);
1585 a.bic16b(A::v4, A::v3, A::v1);
1586 a.bsl16b(A::v4, A::v3, A::v1);
1587 a.not16b(A::v4, A::v3);
1588
1589 a.add4s(A::v4, A::v3, A::v1);
1590 a.sub4s(A::v4, A::v3, A::v1);
1591 a.mul4s(A::v4, A::v3, A::v1);
1592
1593 a.cmeq4s(A::v4, A::v3, A::v1);
1594 a.cmgt4s(A::v4, A::v3, A::v1);
1595
1596 a.sub8h(A::v4, A::v3, A::v1);
1597 a.mul8h(A::v4, A::v3, A::v1);
1598
1599 a.fadd4s(A::v4, A::v3, A::v1);
1600 a.fsub4s(A::v4, A::v3, A::v1);
1601 a.fmul4s(A::v4, A::v3, A::v1);
1602 a.fdiv4s(A::v4, A::v3, A::v1);
1603 a.fmin4s(A::v4, A::v3, A::v1);
1604 a.fmax4s(A::v4, A::v3, A::v1);
1605
1606 a.fneg4s (A::v4, A::v3);
1607 a.fsqrt4s(A::v4, A::v3);
1608
1609 a.fmla4s(A::v4, A::v3, A::v1);
1610 a.fmls4s(A::v4, A::v3, A::v1);
1611
1612 a.fcmeq4s(A::v4, A::v3, A::v1);
1613 a.fcmgt4s(A::v4, A::v3, A::v1);
1614 a.fcmge4s(A::v4, A::v3, A::v1);
1615 },{
1616 0x64,0x1c,0x21,0x4e,
1617 0x64,0x1c,0xa1,0x4e,
1618 0x64,0x1c,0x21,0x6e,
1619 0x64,0x1c,0x61,0x4e,
1620 0x64,0x1c,0x61,0x6e,
1621 0x64,0x58,0x20,0x6e,
1622
1623 0x64,0x84,0xa1,0x4e,
1624 0x64,0x84,0xa1,0x6e,
1625 0x64,0x9c,0xa1,0x4e,
1626
1627 0x64,0x8c,0xa1,0x6e,
1628 0x64,0x34,0xa1,0x4e,
1629
1630 0x64,0x84,0x61,0x6e,
1631 0x64,0x9c,0x61,0x4e,
1632
1633 0x64,0xd4,0x21,0x4e,
1634 0x64,0xd4,0xa1,0x4e,
1635 0x64,0xdc,0x21,0x6e,
1636 0x64,0xfc,0x21,0x6e,
1637 0x64,0xf4,0xa1,0x4e,
1638 0x64,0xf4,0x21,0x4e,
1639
1640 0x64,0xf8,0xa0,0x6e,
1641 0x64,0xf8,0xa1,0x6e,
1642
1643 0x64,0xcc,0x21,0x4e,
1644 0x64,0xcc,0xa1,0x4e,
1645
1646 0x64,0xe4,0x21,0x4e,
1647 0x64,0xe4,0xa1,0x6e,
1648 0x64,0xe4,0x21,0x6e,
1649 });
1650
1651 test_asm(r, [&](A& a) {
1652 a.shl4s(A::v4, A::v3, 0);
1653 a.shl4s(A::v4, A::v3, 1);
1654 a.shl4s(A::v4, A::v3, 8);
1655 a.shl4s(A::v4, A::v3, 16);
1656 a.shl4s(A::v4, A::v3, 31);
1657
1658 a.sshr4s(A::v4, A::v3, 1);
1659 a.sshr4s(A::v4, A::v3, 8);
1660 a.sshr4s(A::v4, A::v3, 31);
1661
1662 a.ushr4s(A::v4, A::v3, 1);
1663 a.ushr4s(A::v4, A::v3, 8);
1664 a.ushr4s(A::v4, A::v3, 31);
1665
1666 a.ushr8h(A::v4, A::v3, 1);
1667 a.ushr8h(A::v4, A::v3, 8);
1668 a.ushr8h(A::v4, A::v3, 15);
1669 },{
1670 0x64,0x54,0x20,0x4f,
1671 0x64,0x54,0x21,0x4f,
1672 0x64,0x54,0x28,0x4f,
1673 0x64,0x54,0x30,0x4f,
1674 0x64,0x54,0x3f,0x4f,
1675
1676 0x64,0x04,0x3f,0x4f,
1677 0x64,0x04,0x38,0x4f,
1678 0x64,0x04,0x21,0x4f,
1679
1680 0x64,0x04,0x3f,0x6f,
1681 0x64,0x04,0x38,0x6f,
1682 0x64,0x04,0x21,0x6f,
1683
1684 0x64,0x04,0x1f,0x6f,
1685 0x64,0x04,0x18,0x6f,
1686 0x64,0x04,0x11,0x6f,
1687 });
1688
1689 test_asm(r, [&](A& a) {
1690 a.sli4s(A::v4, A::v3, 0);
1691 a.sli4s(A::v4, A::v3, 1);
1692 a.sli4s(A::v4, A::v3, 8);
1693 a.sli4s(A::v4, A::v3, 16);
1694 a.sli4s(A::v4, A::v3, 31);
1695 },{
1696 0x64,0x54,0x20,0x6f,
1697 0x64,0x54,0x21,0x6f,
1698 0x64,0x54,0x28,0x6f,
1699 0x64,0x54,0x30,0x6f,
1700 0x64,0x54,0x3f,0x6f,
1701 });
1702
1703 test_asm(r, [&](A& a) {
1704 a.scvtf4s (A::v4, A::v3);
1705 a.fcvtzs4s(A::v4, A::v3);
1706 a.fcvtns4s(A::v4, A::v3);
1707 a.frintp4s(A::v4, A::v3);
1708 a.frintm4s(A::v4, A::v3);
1709 a.fcvtn (A::v4, A::v3);
1710 a.fcvtl (A::v4, A::v3);
1711 },{
1712 0x64,0xd8,0x21,0x4e,
1713 0x64,0xb8,0xa1,0x4e,
1714 0x64,0xa8,0x21,0x4e,
1715 0x64,0x88,0xa1,0x4e,
1716 0x64,0x98,0x21,0x4e,
1717 0x64,0x68,0x21,0x0e,
1718 0x64,0x78,0x21,0x0e,
1719 });
1720
1721 test_asm(r, [&](A& a) {
1722 a.sub (A::sp, A::sp, 32); // sub sp, sp, #32
1723 a.strq(A::v0, A::sp, 1); // str q0, [sp, #16]
1724 a.strq(A::v1, A::sp); // str q1, [sp]
 1725         a.strd(A::v0, A::sp,  6);   // str d0, [sp, #48]
1726 a.strs(A::v0, A::sp, 6); // str s0, [sp, #24]
1727 a.strh(A::v0, A::sp, 10); // str h0, [sp, #20]
1728 a.strb(A::v0, A::sp, 47); // str b0, [sp, #47]
1729 a.ldrb(A::v9, A::sp, 42); // ldr b9, [sp, #42]
1730 a.ldrh(A::v9, A::sp, 47); // ldr h9, [sp, #94]
1731 a.ldrs(A::v7, A::sp, 10); // ldr s7, [sp, #40]
1732 a.ldrd(A::v7, A::sp, 1); // ldr d7, [sp, #8]
1733 a.ldrq(A::v5, A::sp, 128); // ldr q5, [sp, #2048]
1734 a.add (A::sp, A::sp, 32); // add sp, sp, #32
1735 },{
1736 0xff,0x83,0x00,0xd1,
1737 0xe0,0x07,0x80,0x3d,
1738 0xe1,0x03,0x80,0x3d,
1739 0xe0,0x1b,0x00,0xfd,
1740 0xe0,0x1b,0x00,0xbd,
1741 0xe0,0x2b,0x00,0x7d,
1742 0xe0,0xbf,0x00,0x3d,
1743 0xe9,0xab,0x40,0x3d,
1744 0xe9,0xbf,0x40,0x7d,
1745 0xe7,0x2b,0x40,0xbd,
1746 0xe7,0x07,0x40,0xfd,
1747 0xe5,0x03,0xc2,0x3d,
1748 0xff,0x83,0x00,0x91,
1749 });
1750
1751 test_asm(r, [&](A& a) {
1752 a.brk(0);
1753 a.brk(65535);
1754
1755 a.ret(A::x30); // Conventional ret using link register.
1756 a.ret(A::x13); // Can really return using any register if we like.
1757
1758 a.add(A::x2, A::x2, 4);
1759 a.add(A::x3, A::x2, 32);
1760
1761 a.sub(A::x2, A::x2, 4);
1762 a.sub(A::x3, A::x2, 32);
1763
1764 a.subs(A::x2, A::x2, 4);
1765 a.subs(A::x3, A::x2, 32);
1766
1767 a.subs(A::xzr, A::x2, 4); // These are actually the same instruction!
1768 a.cmp(A::x2, 4);
1769
1770 A::Label l;
1771 a.label(&l);
1772 a.bne(&l);
1773 a.bne(&l);
1774 a.blt(&l);
1775 a.b(&l);
1776 a.cbnz(A::x2, &l);
1777 a.cbz(A::x2, &l);
1778
1779 a.add(A::x3, A::x2, A::x1); // add x3,x2,x1
1780 a.add(A::x3, A::x2, A::x1, A::ASR, 3); // add x3,x2,x1, asr #3
1781 },{
1782 0x00,0x00,0x20,0xd4,
1783 0xe0,0xff,0x3f,0xd4,
1784
1785 0xc0,0x03,0x5f,0xd6,
1786 0xa0,0x01,0x5f,0xd6,
1787
1788 0x42,0x10,0x00,0x91,
1789 0x43,0x80,0x00,0x91,
1790
1791 0x42,0x10,0x00,0xd1,
1792 0x43,0x80,0x00,0xd1,
1793
1794 0x42,0x10,0x00,0xf1,
1795 0x43,0x80,0x00,0xf1,
1796
1797 0x5f,0x10,0x00,0xf1,
1798 0x5f,0x10,0x00,0xf1,
1799
1800 0x01,0x00,0x00,0x54, // b.ne #0
1801 0xe1,0xff,0xff,0x54, // b.ne #-4
1802 0xcb,0xff,0xff,0x54, // b.lt #-8
1803 0xae,0xff,0xff,0x54, // b.al #-12
1804 0x82,0xff,0xff,0xb5, // cbnz x2, #-16
1805 0x62,0xff,0xff,0xb4, // cbz x2, #-20
1806
1807 0x43,0x00,0x01,0x8b,
1808 0x43,0x0c,0x81,0x8b,
1809 });
1810
1811 // Can we cbz() to a not-yet-defined label?
1812 test_asm(r, [&](A& a) {
1813 A::Label l;
1814 a.cbz(A::x2, &l);
1815 a.add(A::x3, A::x2, 32);
1816 a.label(&l);
1817 a.ret(A::x30);
1818 },{
1819 0x42,0x00,0x00,0xb4, // cbz x2, #8
1820 0x43,0x80,0x00,0x91, // add x3, x2, #32
1821 0xc0,0x03,0x5f,0xd6, // ret
1822 });
1823
1824 // If we start a label as a backward label,
1825 // can we redefine it to be a future label?
1826 // (Not sure this is useful... just want to test it works.)
1827 test_asm(r, [&](A& a) {
1828 A::Label l1;
1829 a.label(&l1);
1830 a.add(A::x3, A::x2, 32);
1831 a.cbz(A::x2, &l1); // This will jump backward... nothing sneaky.
1832
1833 A::Label l2; // Start off the same...
1834 a.label(&l2);
1835 a.add(A::x3, A::x2, 32);
1836 a.cbz(A::x2, &l2); // Looks like this will go backward...
1837 a.add(A::x2, A::x2, 4);
1838 a.add(A::x3, A::x2, 32);
1839 a.label(&l2); // But no... actually forward! What a switcheroo!
1840 },{
1841 0x43,0x80,0x00,0x91, // add x3, x2, #32
1842 0xe2,0xff,0xff,0xb4, // cbz x2, #-4
1843
1844 0x43,0x80,0x00,0x91, // add x3, x2, #32
1845 0x62,0x00,0x00,0xb4, // cbz x2, #12
1846 0x42,0x10,0x00,0x91, // add x2, x2, #4
1847 0x43,0x80,0x00,0x91, // add x3, x2, #32
1848 });
1849
1850 // Loading from a label on ARM.
1851 test_asm(r, [&](A& a) {
1852 A::Label fore,aft;
1853 a.label(&fore);
1854 a.word(0x01234567);
1855 a.ldrq(A::v1, &fore);
1856 a.ldrq(A::v2, &aft);
1857 a.label(&aft);
1858 a.word(0x76543210);
1859 },{
1860 0x67,0x45,0x23,0x01,
1861 0xe1,0xff,0xff,0x9c, // ldr q1, #-4
1862 0x22,0x00,0x00,0x9c, // ldr q2, #4
1863 0x10,0x32,0x54,0x76,
1864 });
1865
1866 test_asm(r, [&](A& a) {
1867 a.ldrq(A::v0, A::x8);
1868 a.strq(A::v0, A::x8);
1869 },{
1870 0x00,0x01,0xc0,0x3d,
1871 0x00,0x01,0x80,0x3d,
1872 });
1873
1874 test_asm(r, [&](A& a) {
1875 a.dup4s (A::v0, A::x8);
1876 a.ld1r4s (A::v0, A::x8); // echo 'ld1r.4s {v0}, [x8]' | llvm-mc --show-encoding
1877 a.ld1r8h (A::v0, A::x8);
1878 a.ld1r16b(A::v0, A::x8);
1879 },{
1880 0x00,0x0d,0x04,0x4e,
1881 0x00,0xc9,0x40,0x4d,
1882 0x00,0xc5,0x40,0x4d,
1883 0x00,0xc1,0x40,0x4d,
1884 });
1885
1886 test_asm(r, [&](A& a) {
1887 a.ld24s(A::v0, A::x8); // echo 'ld2.4s {v0,v1}, [x8]' | llvm-mc --show-encoding
1888 a.ld44s(A::v0, A::x8);
1889 a.st24s(A::v0, A::x8);
1890 a.st44s(A::v0, A::x8); // echo 'st4.4s {v0,v1,v2,v3}, [x8]' | llvm-mc --show-encoding
1891
1892 a.ld24s(A::v0, A::x8, 0); //echo 'ld2 {v0.s,v1.s}[0], [x8]' | llvm-mc --show-encoding
1893 a.ld24s(A::v0, A::x8, 1);
1894 a.ld24s(A::v0, A::x8, 2);
1895 a.ld24s(A::v0, A::x8, 3);
1896
1897 a.ld44s(A::v0, A::x8, 0); // ld4 {v0.s,v1.s,v2.s,v3.s}[0], [x8]
1898 a.ld44s(A::v0, A::x8, 1);
1899 a.ld44s(A::v0, A::x8, 2);
1900 a.ld44s(A::v0, A::x8, 3);
1901 },{
1902 0x00,0x89,0x40,0x4c,
1903 0x00,0x09,0x40,0x4c,
1904 0x00,0x89,0x00,0x4c,
1905 0x00,0x09,0x00,0x4c,
1906
1907 0x00,0x81,0x60,0x0d,
1908 0x00,0x91,0x60,0x0d,
1909 0x00,0x81,0x60,0x4d,
1910 0x00,0x91,0x60,0x4d,
1911
1912 0x00,0xa1,0x60,0x0d,
1913 0x00,0xb1,0x60,0x0d,
1914 0x00,0xa1,0x60,0x4d,
1915 0x00,0xb1,0x60,0x4d,
1916 });
1917
1918 test_asm(r, [&](A& a) {
1919 a.xtns2h(A::v0, A::v0);
1920 a.xtnh2b(A::v0, A::v0);
1921 a.strs (A::v0, A::x0);
1922
1923 a.ldrs (A::v0, A::x0);
1924 a.uxtlb2h(A::v0, A::v0);
1925 a.uxtlh2s(A::v0, A::v0);
1926
1927 a.uminv4s(A::v3, A::v4);
1928 a.movs (A::x3, A::v4,0); // mov.s w3,v4[0]
1929 a.movs (A::x3, A::v4,1); // mov.s w3,v4[1]
1930 a.inss (A::v4, A::x3,3); // ins.s v4[3],w3
1931 },{
1932 0x00,0x28,0x61,0x0e,
1933 0x00,0x28,0x21,0x0e,
1934 0x00,0x00,0x00,0xbd,
1935
1936 0x00,0x00,0x40,0xbd,
1937 0x00,0xa4,0x08,0x2f,
1938 0x00,0xa4,0x10,0x2f,
1939
1940 0x83,0xa8,0xb1,0x6e,
1941 0x83,0x3c,0x04,0x0e,
1942 0x83,0x3c,0x0c,0x0e,
1943 0x64,0x1c,0x1c,0x4e,
1944 });
1945
1946 test_asm(r, [&](A& a) {
1947 a.ldrb(A::v0, A::x8);
1948 a.strb(A::v0, A::x8);
1949 },{
1950 0x00,0x01,0x40,0x3d,
1951 0x00,0x01,0x00,0x3d,
1952 });
1953
1954 test_asm(r, [&](A& a) {
1955 a.ldrd(A::x0, A::x1, 3); // ldr x0, [x1, #24]
1956 a.ldrs(A::x0, A::x1, 3); // ldr w0, [x1, #12]
1957 a.ldrh(A::x0, A::x1, 3); // ldrh w0, [x1, #6]
1958 a.ldrb(A::x0, A::x1, 3); // ldrb w0, [x1, #3]
1959
1960 a.strs(A::x0, A::x1, 3); // str w0, [x1, #12]
1961 },{
1962 0x20,0x0c,0x40,0xf9,
1963 0x20,0x0c,0x40,0xb9,
1964 0x20,0x0c,0x40,0x79,
1965 0x20,0x0c,0x40,0x39,
1966
1967 0x20,0x0c,0x00,0xb9,
1968 });
1969
1970 test_asm(r, [&](A& a) {
1971 a.tbl (A::v0, A::v1, A::v2);
1972 a.uzp14s(A::v0, A::v1, A::v2);
1973 a.uzp24s(A::v0, A::v1, A::v2);
1974 a.zip14s(A::v0, A::v1, A::v2);
1975 a.zip24s(A::v0, A::v1, A::v2);
1976 },{
1977 0x20,0x00,0x02,0x4e,
1978 0x20,0x18,0x82,0x4e,
1979 0x20,0x58,0x82,0x4e,
1980 0x20,0x38,0x82,0x4e,
1981 0x20,0x78,0x82,0x4e,
1982 });
1983 }
1984
DEF_TEST(SkVM_approx_math, r) {
    // Builds a single-varying program that applies fn to each of the N floats
    // in `values`, writing the results back in place.
    auto eval = [](int N, float values[], auto fn) {
        skvm::Builder b;
        skvm::Ptr inout  = b.varying<float>();

        b.storeF(inout, fn(&b, b.loadF(inout)));

        b.done().eval(N, values);
    };

    // Asserts each computed value is within 0.001 of its expected counterpart.
    auto compare = [r](int N, const float values[], const float expected[]) {
        for (int i = 0; i < N; ++i) {
            REPORTER_ASSERT(r, SkScalarNearlyEqual(values[i], expected[i], 0.001f));
        }
    };

    // log2
    {
        float values[] = {0.25f, 0.5f, 1, 2, 4, 8};
        constexpr int N = SK_ARRAY_COUNT(values);
        eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
            return b->approx_log2(v);
        });
        const float expected[] = {-2, -1, 0, 1, 2, 3};
        compare(N, values, expected);
    }

    // pow2
    {
        float values[] = {-2, -1, 0, 1, 2, 3};
        constexpr int N = SK_ARRAY_COUNT(values);
        eval(N, values, [](skvm::Builder* b, skvm::F32 v) {
            return b->approx_pow2(v);
        });
        const float expected[] = {0.25f, 0.5f, 1, 2, 4, 8};
        compare(N, values, expected);
    }

    // powf -- x^0.5
    {
        float bases[] = {0, 1, 4, 9, 16};
        constexpr int N = SK_ARRAY_COUNT(bases);
        eval(N, bases, [](skvm::Builder* b, skvm::F32 base) {
            return b->approx_powf(base, b->splat(0.5f));
        });
        const float expected[] = {0, 1, 2, 3, 4};
        compare(N, bases, expected);
    }
    // powf -- 3^x
    {
        float exps[] = {-2, -1, 0, 1, 2};
        constexpr int N = SK_ARRAY_COUNT(exps);
        eval(N, exps, [](skvm::Builder* b, skvm::F32 exp) {
            return b->approx_powf(b->splat(3.0f), exp);
        });
        const float expected[] = {1/9.0f, 1/3.0f, 1, 3, 9};
        compare(N, exps, expected);
    }

    // Runs a one-input program on `arg`, reports a failure if the result is
    // not within `tolerance` of `expected`, and returns the absolute error so
    // callers can accumulate it.
    auto test = [r](float arg, float expected, float tolerance, auto prog) {
        skvm::Builder b;
        skvm::Ptr inout  = b.varying<float>();
        b.storeF(inout, prog(b.loadF(inout)));
        float actual = arg;
        b.done().eval(1, &actual);

        float err = std::abs(actual - expected);

        if (err > tolerance) {
         // SkDebugf("arg %g, expected %g, actual %g\n", arg, expected, actual);
            // BUG FIX: this used to be REPORTER_ASSERT(r, true) — a tautology
            // that can never fire, silently disabling the tolerance check.
            REPORTER_ASSERT(r, false);
        }
        return err;
    };

    // Two-input variant of `test` above: evaluates prog(arg0, arg1) and
    // checks the result against `expected` within `tolerance`.
    auto test2 = [r](float arg0, float arg1, float expected, float tolerance, auto prog) {
        skvm::Builder b;
        skvm::Ptr in0 = b.varying<float>();
        skvm::Ptr in1 = b.varying<float>();
        skvm::Ptr out = b.varying<float>();
        b.storeF(out, prog(b.loadF(in0), b.loadF(in1)));
        float actual;
        b.done().eval(1, &arg0, &arg1, &actual);

        float err = std::abs(actual - expected);

        if (err > tolerance) {
         // SkDebugf("[%g, %g]: expected %g, actual %g\n", arg0, arg1, expected, actual);
            // BUG FIX: was REPORTER_ASSERT(r, true); see `test` above.
            REPORTER_ASSERT(r, false);
        }
        return err;
    };

    // sine, cosine, tangent
    {
        constexpr float P = SK_ScalarPI;
        constexpr float tol = 0.00175f;
        for (float rad = -5*P; rad <= 5*P; rad += 0.1f) {
            test(rad, sk_float_sin(rad), tol, [](skvm::F32 x) {
                return approx_sin(x);
            });
            test(rad, sk_float_cos(rad), tol, [](skvm::F32 x) {
                return approx_cos(x);
            });
        }

        // Our tangent diverge more as we get near infinities (x near +- Pi/2),
        // so bring in the domain a little.
        constexpr float eps = 0.16f;
        float err = 0;
        for (float rad = -P/2 + eps; rad <= P/2 - eps; rad += 0.01f) {
            err += test(rad, sk_float_tan(rad), tol, [](skvm::F32 x) {
                return approx_tan(x);
            });
            // try again with some multiples of P, to check our periodicity
            test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
                return approx_tan(x + 3*P);
            });
            test(rad, sk_float_tan(rad), tol, [=](skvm::F32 x) {
                return approx_tan(x - 3*P);
            });
        }
        if (0) { SkDebugf("tan error %g\n", err); }
    }

    // asin, acos, atan
    {
        constexpr float tol = 0.00175f;
        float err = 0;
        for (float x = -1; x <= 1; x += 1.0f/64) {
            err += test(x, asin(x), tol, [](skvm::F32 x) {
                return approx_asin(x);
            });
            test(x, acos(x), tol, [](skvm::F32 x) {
                return approx_acos(x);
            });
        }
        if (0) { SkDebugf("asin error %g\n", err); }

        err = 0;
        for (float x = -10; x <= 10; x += 1.0f/16) {
            err += test(x, atan(x), tol, [](skvm::F32 x) {
                return approx_atan(x);
            });
        }
        if (0) { SkDebugf("atan error %g\n", err); }

        for (float y = -3; y <= 3; y += 1) {
            for (float x = -3; x <= 3; x += 1) {
                err += test2(y, x, atan2(y,x), tol, [](skvm::F32 y, skvm::F32 x) {
                    return approx_atan2(y,x);
                });
            }
        }
        if (0) { SkDebugf("atan2 error %g\n", err); }
    }
}
2142
DEF_TEST(SkVM_min_max, r) {
    // min() and max() have subtle behavior when one argument is NaN and
    // the other isn't. It's not sound to blindly swap their arguments.
    //
    // All backends must behave like std::min() and std::max(), which are
    //
    //    min(x,y) = y<x ? y : x
    //    max(x,y) = x<y ? y : x

    // ±NaN, ±0, ±1, ±inf
    const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                             0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};

    float f[8];
    memcpy(f, bits, sizeof(bits));

    // Compare exact bit patterns: float == would call +0.0f and -0.0f equal,
    // and any comparison involving NaN would be false.
    auto identical = [&](float x, float y) {
        uint32_t X,Y;
        memcpy(&X, &x, 4);
        memcpy(&Y, &y, 4);
        return X == Y;
    };

    // Test min/max with non-constant x, non-constant y.
    // (Whether x and y are varying or uniform shouldn't make any difference.)
    {
        skvm::Builder b;
        {
            skvm::Ptr src = b.varying<float>(),
                       mn = b.varying<float>(),
                       mx = b.varying<float>();

            skvm::F32 x = b.loadF(src),
                      y = b.uniformF(b.uniform(), 0);

            b.storeF(mn, b.min(x,y));
            b.storeF(mx, b.max(x,y));
        }

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            float mn[8], mx[8];
            for (int i = 0; i < 8; i++) {
                // min() and max() everything with f[i].
                program.eval(8, f,mn,mx, &f[i]);

                for (int j = 0; j < 8; j++) {
                    REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
                    REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
                }
            }
        });
    }

    // Test each with constant on the right.
    // (A constant operand can tempt a backend into commuting or folding;
    // neither is sound here when the other side may be NaN.)
    for (int i = 0; i < 8; i++) {
        skvm::Builder b;
        {
            skvm::Ptr src = b.varying<float>(),
                       mn = b.varying<float>(),
                       mx = b.varying<float>();

            skvm::F32 x = b.loadF(src),
                      y = b.splat(f[i]);

            b.storeF(mn, b.min(x,y));
            b.storeF(mx, b.max(x,y));
        }

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            float mn[8], mx[8];
            program.eval(8, f,mn,mx);
            for (int j = 0; j < 8; j++) {
                REPORTER_ASSERT(r, identical(mn[j], std::min(f[j], f[i])));
                REPORTER_ASSERT(r, identical(mx[j], std::max(f[j], f[i])));
            }
        });
    }

    // Test each with constant on the left.
    // (Mirror of the case above; note the expectations also swap argument order.)
    for (int i = 0; i < 8; i++) {
        skvm::Builder b;
        {
            skvm::Ptr src = b.varying<float>(),
                       mn = b.varying<float>(),
                       mx = b.varying<float>();

            skvm::F32 x = b.splat(f[i]),
                      y = b.loadF(src);

            b.storeF(mn, b.min(x,y));
            b.storeF(mx, b.max(x,y));
        }

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            float mn[8], mx[8];
            program.eval(8, f,mn,mx);
            for (int j = 0; j < 8; j++) {
                REPORTER_ASSERT(r, identical(mn[j], std::min(f[i], f[j])));
                REPORTER_ASSERT(r, identical(mx[j], std::max(f[i], f[j])));
            }
        });
    }
}
2246
DEF_TEST(SkVM_halfs, r) {
    // Parallel tables: halfBits[i] is the fp16 encoding of floatVals[i].
    const uint16_t halfBits[] = {0x0000,0x3800,0x3c00,0x4000,
                                 0xc400,0xb800,0xbc00,0xc000};
    const float floatVals[] = {+0.0f,+0.5f,+1.0f,+2.0f,
                               -4.0f,-0.5f,-1.0f,-2.0f};

    // Widen: fp16 -> f32.
    {
        skvm::Builder b;
        skvm::Ptr src = b.varying<uint16_t>(),
                  dst = b.varying<float>();
        b.storeF(dst, b.from_fp16(b.load16(src)));

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            float widened[8];
            program.eval(8, halfBits, widened);
            for (int i = 0; i < 8; i++) {
                REPORTER_ASSERT(r, widened[i] == floatVals[i]);
            }
        });
    }

    // Narrow: f32 -> fp16.
    {
        skvm::Builder b;
        skvm::Ptr src = b.varying<float>(),
                  dst = b.varying<uint16_t>();
        b.store16(dst, b.to_fp16(b.loadF(src)));

        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint16_t narrowed[8];
            program.eval(8, floatVals, narrowed);
            for (int i = 0; i < 8; i++) {
                REPORTER_ASSERT(r, narrowed[i] == halfBits[i]);
            }
        });
    }
}
2281
DEF_TEST(SkVM_64bit, r) {
    // 65 entries (not a multiple of any vector width) so both the vectorized
    // body and the scalar tail get exercised.
    uint32_t loHalf[65],
             hiHalf[65];
    uint64_t full[65];
    for (int i = 0; i < 65; i++) {
        loHalf[i] = 2*i+0;
        hiHalf[i] = 2*i+1;
        full[i]   = (uint64_t)loHalf[i]
                  | (uint64_t)hiHalf[i] << 32;
    }

    // Split each 64-bit value into its two 32-bit halves.
    {
        skvm::Builder b;
        {
            skvm::Ptr widePtr = b.varying<uint64_t>(),
                        loPtr = b.varying<int>(),
                        hiPtr = b.varying<int>();
            b.store32(loPtr, b.load64(widePtr, 0));
            b.store32(hiPtr, b.load64(widePtr, 1));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint32_t gotLo[65], gotHi[65];
            program.eval(65, full, gotLo, gotHi);
            for (int i = 0; i < 65; i++) {
                REPORTER_ASSERT(r, gotLo[i] == loHalf[i]);
                REPORTER_ASSERT(r, gotHi[i] == hiHalf[i]);
            }
        });
    }

    // Fuse two 32-bit halves back into one 64-bit value.
    {
        skvm::Builder b;
        {
            skvm::Ptr widePtr = b.varying<uint64_t>(),
                        loPtr = b.varying<int>(),
                        hiPtr = b.varying<int>();
            b.store64(widePtr, b.load32(loPtr), b.load32(hiPtr));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            uint64_t gotWide[65];
            program.eval(65, gotWide, loHalf, hiHalf);
            for (int i = 0; i < 65; i++) {
                REPORTER_ASSERT(r, gotWide[i] == full[i]);
            }
        });
    }
}
2329
DEF_TEST(SkVM_128bit, r) {
    // Fill the F32 buffer with 0/255, 1/255, 2/255, ... so converting to
    // 8888 should produce the byte sequence 0,1,2,...
    float   floats[4*63];
    uint8_t packed[4*63];

    for (int i = 0; i < 4*63; i++) {
        floats[i] = i * (1/255.0f);
    }

    skvm::PixelFormat rgba_ffff = skvm::SkColorType_to_PixelFormat(kRGBA_F32_SkColorType),
                      rgba_8888 = skvm::SkColorType_to_PixelFormat(kRGBA_8888_SkColorType);

    {   // RGBA F32 -> RGBA 8888 exercises 128-bit loads.
        skvm::Builder b;
        {
            skvm::Ptr dst = b.varying(4),
                      src = b.varying(16);

            b.store(rgba_8888, dst, b.load(rgba_ffff, src));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            memset(packed, 0, sizeof(packed));
            program.eval(63, packed, floats);
            for (int i = 0; i < 4*63; i++) {
                REPORTER_ASSERT(r, packed[i] == i);
            }
        });
    }

    {   // RGBA 8888 -> RGBA F32 exercises 128-bit stores.
        // Note: this reuses `packed` as filled by the pass above.
        skvm::Builder b;
        {
            skvm::Ptr dst = b.varying(16),
                      src = b.varying(4);

            b.store(rgba_ffff, dst, b.load(rgba_8888, src));
        }
        test_jit_and_interpreter(b, [&](const skvm::Program& program){
            memset(floats, 0, sizeof(floats));
            program.eval(63, floats, packed);
            for (int i = 0; i < 4*63; i++) {
                REPORTER_ASSERT(r, floats[i] == i * (1/255.0f));
            }
        });
    }
}
2379
DEF_TEST(SkVM_is_NaN_is_finite, r) {
    skvm::Builder b;
    {
        skvm::Ptr src    = b.varying<float>(),
                  nanDst = b.varying<int>(),
                  finDst = b.varying<int>();
        b.store32(nanDst, is_NaN   (b.loadF(src)));
        b.store32(finDst, is_finite(b.loadF(src)));
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        // ±NaN, ±0, ±1, ±inf
        const uint32_t bits[] = {0x7f80'0001, 0xff80'0001, 0x0000'0000, 0x8000'0000,
                                 0x3f80'0000, 0xbf80'0000, 0x7f80'0000, 0xff80'0000};
        uint32_t gotNaN[8], gotFin[8];
        program.eval(8, bits, gotNaN, gotFin);

        for (int i = 0; i < 8; i++) {
            // Slots 0,1 are the NaNs; slots 2-5 (±0, ±1) are the finite values.
            const bool expectNaN    = (i == 0 || i == 1);
            const bool expectFinite = (i >= 2 && i <= 5);
            REPORTER_ASSERT(r, gotNaN[i] == (expectNaN    ? 0xffffffff : 0));
            REPORTER_ASSERT(r, gotFin[i] == (expectFinite ? 0xffffffff : 0));
        }
    });
}
2403
DEF_TEST(SkVM_args, r) {
    // Make sure a program can take at least six pointer arguments.
    skvm::Builder b;
    {
        skvm::Ptr dst = b.varying<float>(),
                  A   = b.varying<float>(),
                  B   = b.varying<float>(),
                  C   = b.varying<float>(),
                  D   = b.varying<float>(),
                  E   = b.varying<float>();

        skvm::F32 sum = b.loadF(A)
                      + b.loadF(B)
                      + b.loadF(C)
                      + b.loadF(D)
                      + b.loadF(E);
        b.storeF(dst, sum);
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        float dst[17],
              in[5][17];   // One row per input argument A..E.
        for (int i = 0; i < 17; i++) {
            for (auto& row : in) {
                row[i] = (float)i;
            }
        }
        program.eval(17, dst, in[0], in[1], in[2], in[3], in[4]);
        for (int i = 0; i < 17; i++) {
            REPORTER_ASSERT(r, dst[i] == 5.0f*i);
        }
    });
}
2432
DEF_TEST(SkVM_badpack, reporter) {
    // Test case distilled from actual failing draw,
    // originally with a bad arm64 implementation of pack().
    skvm::Builder p;
    {
        skvm::UPtr uniforms = p.uniform();
        skvm::Ptr     dst = p.varying<uint16_t>();

        // uniforms+8 is the third float pushed below (1.0f),
        // so r rounds 1.0f * 15 to 15 (0xf).
        skvm::I32 r = round(p.uniformF(uniforms, 8) * 15),
                  a = p.splat(0xf);

        // Assemble a 4444-style pixel: r in the top nibble, a in the bottom.
        // The two pack()s at different shifts are what tripped the arm64 bug.
        skvm::I32 _4444 = p.splat(0);
        _4444 = pack(_4444, r, 12);
        _4444 = pack(_4444, a, 0);
        store16(dst, _4444);
    }

    test_jit_and_interpreter(p, [&](const skvm::Program& program){
        const float uniforms[] = { 0.0f, 0.0f,
                                   1.0f, 0.0f, 0.0f, 1.0f };

        uint16_t dst[17] = {0};
        program.eval(17, uniforms,dst);
        for (int i = 0; i < 17; i++) {
            // Expect r=0xf at bits 12-15 and a=0xf at bits 0-3: 0xf00f.
            REPORTER_ASSERT(reporter, dst[i] == 0xf00f, "got %04x, want %04x\n", dst[i], 0xf00f);
        }
    });
}
2461
DEF_TEST(SkVM_features, r) {
    // A tiny load/mul/add/store program whose instruction count depends on
    // whether mul+add can fuse into a single fma.
    auto buildMulAdd = [](skvm::Builder* b) {
        skvm::F32 v = b->loadF(b->varying<float>());
        b->storeF(b->varying<float>(), v*v+v);
    };

    {   // With FMA forced on: load, fma, store.
        skvm::Features features;
        features.fma = true;
        skvm::Builder b(features);
        buildMulAdd(&b);
        REPORTER_ASSERT(r, b.optimize().size() == 3);
    }

    {   // With FMA forced off: load, mul, add, store.
        skvm::Features features;
        features.fma = false;
        skvm::Builder b(features);
        buildMulAdd(&b);
        REPORTER_ASSERT(r, b.optimize().size() == 4);
    }

    {   // Auto-detected features: either outcome is acceptable.
        skvm::Builder b;
        buildMulAdd(&b);
        const size_t n = b.optimize().size();
        REPORTER_ASSERT(r, n == 3 || n == 4);
    }
}
2491
DEF_TEST(SkVM_gather_can_hoist, r) {
    // A gather instruction isn't necessarily varying... it's whatever its index is.

    {   // Typical case: the index is loaded per-lane, so the gather stays in the loop.
        skvm::Builder b;
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  buf      = b.varying<int>();
        skvm::I32  index    = b.load32(buf);
        b.store32(buf, b.gather32(uniforms,0, index));

        skvm::Program p = b.done();

        // index is varying, so the gather is too; everything lives in the loop.
        //
        //     loop:
        //         v0 = load32 buf
        //         v1 = gather32 uniforms+0 v0
        //         store32 buf v1
        REPORTER_ASSERT(r, p.instructions().size() == 3);
        REPORTER_ASSERT(r, p.loop() == 0);
    }

    {   // Same shape of program, but with a uniform index instead.
        skvm::Builder b;
        skvm::UPtr uniforms = b.uniform();
        skvm::Ptr  buf      = b.varying<int>();
        skvm::I32  index    = b.uniform32(uniforms,8);
        b.store32(buf, b.gather32(uniforms,0, index));

        skvm::Program p = b.done();

        // index is uniform, so the gather hoists out; only the store loops.
        //
        //     v0 = uniform32 uniforms+8
        //     v1 = gather32 uniforms+0 v0
        //     loop:
        //         store32 buf v1
        REPORTER_ASSERT(r, p.instructions().size() == 3);
        REPORTER_ASSERT(r, p.loop() == 2);
    }
}
2534
DEF_TEST(SkVM_dont_dedup_loads, r) {
    // We've been assuming that all Ops with the same arguments produce the same value
    // and deduplicating them, which results in a simple common subexpression eliminator.
    //
    // But we can't soundly dedup two identical loads with a store between.
    // If we dedup the loads in this test program it will always increment by 1, not kRounds.
    constexpr int kRounds = 2;
    skvm::Builder b;
    {
        skvm::Ptr buf = b.varying<int>();
        for (int round = 0; round < kRounds; round++) {
            b.store32(buf, b.load32(buf) + 1);
        }
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        int vals[] = { 0,1,2,3,4 };
        program.eval(SK_ARRAY_COUNT(vals), vals);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(vals); i++) {
            // Each entry must have been incremented once per round.
            REPORTER_ASSERT(r, vals[i] == i+kRounds);
        }
    });
}
2558
DEF_TEST(SkVM_dont_dedup_stores, r) {
    // Following a similar line of reasoning to SkVM_dont_dedup_loads,
    // we cannot dedup stores either. A different store between two identical stores
    // will invalidate the first store, meaning we do need to reissue that store operation.
    skvm::Builder b;
    {
        skvm::Ptr dst = b.varying<int>();
        b.store32(dst, b.splat(4));
        b.store32(dst, b.splat(5));   // This intervening store invalidates the first...
        b.store32(dst, b.splat(4));   // ...so this one must be reissued, not dedup'd away.
    }

    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        int buf[42];
        program.eval(SK_ARRAY_COUNT(buf), buf);
        for (size_t i = 0; i < SK_ARRAY_COUNT(buf); i++) {
            REPORTER_ASSERT(r, buf[i] == 4);
        }
    });
}
2579
DEF_TEST(SkVM_fast_mul, r) {
    // fast_mul(0,x) may assume 0*x == 0 for any x, while a plain multiply must
    // keep IEEE semantics: 0 * ±inf and 0 * NaN both produce NaN.
    skvm::Builder b;
    {
        skvm::Ptr src  = b.varying<float>(),
                  fast = b.varying<float>(),
                  slow = b.varying<float>();
        skvm::F32 x = b.loadF(src);
        b.storeF(fast, fast_mul(0.0f, x));
        b.storeF(slow, 0.0f * x);
    }
    test_jit_and_interpreter(b, [&](const skvm::Program& program){
        const uint32_t bits[] = {
            0x0000'0000, 0x8000'0000, //±0
            0x3f80'0000, 0xbf80'0000, //±1
            0x7f80'0000, 0xff80'0000, //±inf
            0x7f80'0001, 0xff80'0001, //±NaN
        };
        float fast[8],
              slow[8];
        program.eval(8,bits,fast,slow);

        for (int i = 0; i < 8; i++) {
            // fast_mul is allowed to fold 0*x to 0 for every input, even inf/NaN.
            REPORTER_ASSERT(r, fast[i] == 0.0f);

            if (i < 4) {   // ±0, ±1: an honest multiply also gives exactly 0.
                REPORTER_ASSERT(r, slow[i] == 0.0f);
            } else {       // ±inf, ±NaN: an honest multiply must give NaN.
                // Use sk_float_isnan rather than unqualified isnan: the C macro
                // isn't guaranteed to survive <cmath>, and this file already
                // uses the sk_float_* helpers (e.g. sk_float_tan).
                REPORTER_ASSERT(r, sk_float_isnan(slow[i]));
            }
        }
    });
}
2612