1 /*
2 * Copyright 2019 Google LLC
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "include/core/SkColorPriv.h"
9 #include "include/private/SkColorData.h"
10 #include "src/core/SkMSAN.h"
11 #include "src/core/SkVM.h"
12 #include "tests/Test.h"
13 #include "tools/Resources.h"
14 #include "tools/SkVMBuilders.h"
15
16 using Fmt = SrcoverBuilder_F32::Fmt;
fmt_name(Fmt fmt)17 const char* fmt_name(Fmt fmt) {
18 switch (fmt) {
19 case Fmt::A8: return "A8";
20 case Fmt::G8: return "G8";
21 case Fmt::RGBA_8888: return "RGBA_8888";
22 }
23 return "";
24 }
25
dump(skvm::Builder & builder,SkWStream * o)26 static void dump(skvm::Builder& builder, SkWStream* o) {
27 skvm::Program program = builder.done();
28 builder.dump(o);
29 o->writeText("\n");
30 program.dump(o);
31 o->writeText("\n");
32 }
33
34 // TODO: I'd like this to go away and have every test in here run both JIT and interpreter.
35 template <typename Fn>
test_interpreter_only(skiatest::Reporter * r,skvm::Program && program,Fn && test)36 static void test_interpreter_only(skiatest::Reporter* r, skvm::Program&& program, Fn&& test) {
37 REPORTER_ASSERT(r, !program.hasJIT());
38 test((const skvm::Program&) program);
39 }
40
41 template <typename Fn>
test_jit_and_interpreter(skiatest::Reporter * r,skvm::Program && program,Fn && test)42 static void test_jit_and_interpreter(skiatest::Reporter* r, skvm::Program&& program, Fn&& test) {
43 static const bool can_jit = []{
44 // This is about the simplest program we can write, setting an int buffer to a constant.
45 // If this can't JIT, the platform does not support JITing.
46 skvm::Builder b;
47 b.store32(b.varying<int>(), b.splat(42));
48 skvm::Program p = b.done();
49 return p.hasJIT();
50 }();
51
52 if (can_jit) {
53 REPORTER_ASSERT(r, program.hasJIT());
54 test((const skvm::Program&) program);
55 program.dropJIT();
56 }
57 test_interpreter_only(r, std::move(program), std::move(test));
58 }
59
60
// Dumps a collection of builders/programs into one text blob, compares that
// blob against the SkVMTest.expected resource, and then runs several srcover
// programs and checks their pixel results.
DEF_TEST(SkVM, r) {
    SkDynamicMemoryWStream buf;

    // Write all combinations of SrcoverBuilder_F32
    for (int s = 0; s < 3; s++) {
        for (int d = 0; d < 3; d++) {
            auto srcFmt = (Fmt)s,
                 dstFmt = (Fmt)d;
            SrcoverBuilder_F32 builder{srcFmt, dstFmt};

            buf.writeText(fmt_name(srcFmt));
            buf.writeText(" over ");
            buf.writeText(fmt_name(dstFmt));
            buf.writeText("\n");
            dump(builder, &buf);
        }
    }

    // Write the I32 Srcovers also.
    {
        SrcoverBuilder_I32_Naive builder;
        buf.writeText("I32 (Naive) 8888 over 8888\n");
        dump(builder, &buf);
    }
    {
        SrcoverBuilder_I32 builder;
        buf.writeText("I32 8888 over 8888\n");
        dump(builder, &buf);
    }
    {
        SrcoverBuilder_I32_SWAR builder;
        buf.writeText("I32 (SWAR) 8888 over 8888\n");
        dump(builder, &buf);
    }

    {
        skvm::Builder b;
        skvm::Arg arg = b.varying<int>();

        // x and y can both be hoisted,
        // and x can die at y, while y must live for the loop.
        skvm::I32 x = b.splat(1),
                  y = b.add(x, b.splat(2));
        b.store32(arg, b.mul(b.load32(arg), y));

        skvm::Program program = b.done();
        REPORTER_ASSERT(r, program.nregs() == 2);

        std::vector<skvm::OptimizedInstruction> insts = b.optimize();
        REPORTER_ASSERT(r, insts.size() == 6);
        // REPORTER_ASSERT records a failure but keeps going, so only index into
        // insts when it really has the six entries we are about to inspect.
        if (insts.size() == 6) {
            REPORTER_ASSERT(r,  insts[0].can_hoist && insts[0].death == 2 && !insts[0].used_in_loop);
            REPORTER_ASSERT(r,  insts[1].can_hoist && insts[1].death == 2 && !insts[1].used_in_loop);
            REPORTER_ASSERT(r,  insts[2].can_hoist && insts[2].death == 4 &&  insts[2].used_in_loop);
            REPORTER_ASSERT(r, !insts[3].can_hoist);
            REPORTER_ASSERT(r, !insts[4].can_hoist);
            REPORTER_ASSERT(r, !insts[5].can_hoist);
        }

        dump(b, &buf);

        test_jit_and_interpreter(r, std::move(program), [&](const skvm::Program& program) {
            int arg[] = {0,1,2,3,4,5,6,7,8,9};

            program.eval(SK_ARRAY_COUNT(arg), arg);

            for (int i = 0; i < (int)SK_ARRAY_COUNT(arg); i++) {
                REPORTER_ASSERT(r, arg[i] == i*3);
            }
        });
    }

    {
        // Demonstrate the value of program reordering.
        skvm::Builder b;
        skvm::Arg sp = b.varying<int>(),
                  dp = b.varying<int>();

        skvm::I32 byte = b.splat(0xff);

        skvm::I32 src = b.load32(sp),
                  sr  = b.extract(src,  0, byte),
                  sg  = b.extract(src,  8, byte),
                  sb  = b.extract(src, 16, byte),
                  sa  = b.extract(src, 24, byte);

        skvm::I32 dst = b.load32(dp),
                  dr  = b.extract(dst,  0, byte),
                  dg  = b.extract(dst,  8, byte),
                  db  = b.extract(dst, 16, byte),
                  da  = b.extract(dst, 24, byte);

        skvm::I32 R = b.add(sr, dr),
                  G = b.add(sg, dg),
                  B = b.add(sb, db),
                  A = b.add(sa, da);

        skvm::I32 rg   = b.pack(R, G, 8),
                  ba   = b.pack(B, A, 8),
                  rgba = b.pack(rg, ba, 16);

        b.store32(dp, rgba);

        dump(b, &buf);
    }

    sk_sp<SkData> blob = buf.detachAsData();
    {

        sk_sp<SkData> expected = GetResourceAsData("SkVMTest.expected");
        REPORTER_ASSERT(r, expected, "Couldn't load SkVMTest.expected.");
        if (expected) {
            if (blob->size() != expected->size()
                    || 0 != memcmp(blob->data(), expected->data(), blob->size())) {

                // "%.*s" takes an int precision followed by a char pointer, so
                // convert the size_t sizes and untyped data pointers explicitly.
                ERRORF(r, "SkVMTest expected\n%.*s\nbut got\n%.*s\n",
                       (int)expected->size(), (const char*)expected->data(),
                       (int)blob->size(),     (const char*)blob->data());
            }

            // Write what we just generated over the expected file when its path is writable.
            SkFILEWStream out(GetResourcePath("SkVMTest.expected").c_str());
            if (out.isValid()) {
                out.write(blob->data(), blob->size());
            }
        }
    }

    // Runs an 8888-over-8888 program and checks every channel of every pixel
    // is within 1 of what SkPMSrcOver() computes.
    auto test_8888 = [&](skvm::Program&& program) {
        uint32_t src[9];
        uint32_t dst[SK_ARRAY_COUNT(src)];

        test_jit_and_interpreter(r, std::move(program), [&](const skvm::Program& program) {
            for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
                src[i] = 0xbb007733;
                dst[i] = 0xffaaccee;
            }

            SkPMColor expected = SkPMSrcOver(src[0], dst[0]);  // 0xff2dad73

            program.eval((int)SK_ARRAY_COUNT(src), src, dst);

            // dst is probably 0xff2dad72.
            for (auto got : dst) {
                auto want = expected;
                for (int i = 0; i < 4; i++) {
                    uint8_t d = got  & 0xff,
                            w = want & 0xff;
                    if (abs(d-w) >= 2) {
                        SkDebugf("d %02x, w %02x\n", d,w);
                    }
                    REPORTER_ASSERT(r, abs(d-w) < 2);
                    got  >>= 8;
                    want >>= 8;
                }
            }
        });
    };

    test_8888(SrcoverBuilder_F32{Fmt::RGBA_8888, Fmt::RGBA_8888}.done("srcover_f32"));
    test_8888(SrcoverBuilder_I32_Naive{}.done("srcover_i32_naive"));
    test_8888(SrcoverBuilder_I32{}.done("srcover_i32"));
    test_8888(SrcoverBuilder_I32_SWAR{}.done("srcover_i32_SWAR"));

    test_jit_and_interpreter(r, SrcoverBuilder_F32{Fmt::RGBA_8888, Fmt::G8}.done(),
                             [&](const skvm::Program& program) {
        uint32_t src[9];
        uint8_t  dst[SK_ARRAY_COUNT(src)];

        for (int i = 0; i < (int)SK_ARRAY_COUNT(src); i++) {
            src[i] = 0xbb007733;
            dst[i] = 0x42;
        }

        SkPMColor over = SkPMSrcOver(SkPackARGB32(0xbb, 0x33, 0x77, 0x00),
                                     0xff424242);

        uint8_t want = SkComputeLuminance(SkGetPackedR32(over),
                                          SkGetPackedG32(over),
                                          SkGetPackedB32(over));
        program.eval((int)SK_ARRAY_COUNT(src), src, dst);

        for (auto got : dst) {
            REPORTER_ASSERT(r, abs(got-want) < 3);
        }
    });

    test_jit_and_interpreter(r, SrcoverBuilder_F32{Fmt::A8, Fmt::A8}.done(),
                             [&](const skvm::Program& program) {
        uint8_t src[256],
                dst[256];
        for (int i = 0; i < 256; i++) {
            src[i] = 255 - i;
            dst[i] = i;
        }

        program.eval(256, src, dst);

        for (int i = 0; i < 256; i++) {
            uint8_t want = SkGetPackedA32(SkPMSrcOver(SkPackARGB32(src[i], 0,0,0),
                                                      SkPackARGB32(     i, 0,0,0)));
            REPORTER_ASSERT(r, abs(dst[i]-want) < 2);
        }
    });
}
262
DEF_TEST(SkVM_Pointless, r) {
    // Build a program that touches no memory arguments at all.  Nothing it
    // computes is ever stored, yet evaluating it must still work for any N.
    skvm::Builder b;
    {
        b.add(b.splat(5.0f),
              b.splat(4.0f));
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        int n = 0;
        while (n < 64) {
            p.eval(n);
            n++;
        }
    });

    // Every optimized instruction should be hoistable and die immediately.
    const std::vector<skvm::OptimizedInstruction> insts = b.optimize();
    for (size_t i = 0; i < insts.size(); i++) {
        REPORTER_ASSERT(r, insts[i].can_hoist == true && insts[i].death == 0);
    }
}
282
DEF_TEST(SkVM_LoopCounts, r) {
    // Evaluate the same program for every count N from 0 through the full
    // buffer length, checking that exactly the first N elements are touched.

    // buf[i] += 1
    skvm::Builder b;
    skvm::Arg arg = b.varying<int>();
    b.store32(arg,
              b.add(b.splat(1),
                    b.load32(arg)));

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        constexpr int kLen = 64;
        int buf[kLen];

        for (int N = 0; N <= kLen; N++) {
            // Reset the buffer to the identity sequence before each run.
            for (int i = 0; i < kLen; i++) {
                buf[i] = i;
            }

            p.eval(N, buf);

            // Elements below N were incremented; the rest must be untouched.
            for (int i = 0; i < kLen; i++) {
                if (i < N) {
                    REPORTER_ASSERT(r, buf[i] == i+1);
                } else {
                    REPORTER_ASSERT(r, buf[i] == i);
                }
            }
        }
    });
}
310
DEF_TEST(SkVM_gather32, r) {
    // buf[i] = img[buf[i] & 7], where img is reached through the pointer
    // stored at offset 0 of the uniforms.
    skvm::Builder b;
    {
        skvm::Arg uniforms = b.uniform(),
                  buf      = b.varying<int>();
        skvm::I32 x = b.load32(buf);
        b.store32(buf, b.gather32(uniforms,0, b.bit_and(x, b.splat(7))));
    }

#if defined(SK_CPU_X86)
    test_jit_and_interpreter
#else
    test_interpreter_only
#endif
    (r, b.done(), [&](const skvm::Program& p) {
        constexpr int kCount = 20;
        const int img[] = {12,34,56,78, 90,98,76,54};

        int buf[kCount];
        for (int i = 0; i < kCount; i++) {
            buf[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        p.eval(kCount, &uniforms, buf);

        // buf[i] started out as i, so the gather index was i & 7 and the
        // eight image values repeat across the buffer.
        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, buf[i] == img[i & 7]);
        }
    });
}
363
DEF_TEST(SkVM_gathers, r) {
    // Gather 32-, 16- and 8-bit values from the same image, using the loaded
    // value masked to 7, 15 and 31 respectively as the index.
    skvm::Builder b;
    {
        skvm::Arg uniforms = b.uniform(),
                  buf32    = b.varying<int>(),
                  buf16    = b.varying<uint16_t>(),
                  buf8     = b.varying<uint8_t>();

        skvm::I32 x = b.load32(buf32);

        b.store32(buf32, b.gather32(uniforms,0, b.bit_and(x, b.splat( 7))));
        b.store16(buf16, b.gather16(uniforms,0, b.bit_and(x, b.splat(15))));
        b.store8 (buf8 , b.gather8 (uniforms,0, b.bit_and(x, b.splat(31))));
    }

    test_interpreter_only(r, b.done(), [&](const skvm::Program& p) {
        const int img[] = {12,34,56,78, 90,98,76,54};

        constexpr int N = 20;
        int      buf32[N];
        uint16_t buf16[N];
        uint8_t  buf8 [N];

        for (int i = 0; i < N; i++) {
            buf32[i] = i;
        }

        struct Uniforms {
            const int* img;
        } uniforms{img};

        p.eval(N, &uniforms, buf32, buf16, buf8);

        // Expected results for each of the N slots, one table per element width.
        const int want32[N] = {
            12,34,56,78, 90,98,76,54,
            12,34,56,78, 90,98,76,54,
            12,34,56,78,
        };
        const int want16[N] = {
            12, 0,34, 0, 56, 0,78, 0,
            90, 0,98, 0, 76, 0,54, 0,
            12, 0,34, 0,
        };
        const int want8[N] = {
            12, 0, 0, 0, 34, 0, 0, 0,
            56, 0, 0, 0, 78, 0, 0, 0,
            90, 0, 0, 0,
        };

        for (int i = 0; i < N; i++) {
            REPORTER_ASSERT(r, buf32[i] == want32[i] && buf16[i] == want16[i] && buf8[i] == want8[i]);
        }
    });
}
421
DEF_TEST(SkVM_bitops, r) {
    // Chain the bitwise and shift ops together; the trailing comments track
    // the running value for the input 0x42 used below.
    skvm::Builder b;
    {
        skvm::Arg ptr = b.varying<int>();

        skvm::I32 v = b.load32(ptr);

        v = b.bit_and  (v, b.splat(0xf1));  // 0x40
        v = b.bit_or   (v, b.splat(0x80));  // 0xc0
        v = b.bit_xor  (v, b.splat(0xfe));  // 0x3e
        v = b.bit_clear(v, b.splat(0x30));  // 0x0e

        v = b.shl(v, 28);  // 0xe000'0000
        v = b.sra(v, 28);  // 0xffff'fffe
        v = b.shr(v,  1);  // 0x7fff'ffff

        b.store32(ptr, v);
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        int value = 0x42;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 0x7fff'ffff);
    });
}
447
DEF_TEST(SkVM_f32, r) {
    // Exercise float add, sub and div: every input should come back as 1.
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<float>();

        skvm::F32 x = b.bit_cast(b.load32(arg)),
                  y = b.add(x,x),   // y = 2x
                  z = b.sub(y,x),   // z = 2x-x = x
                  w = b.div(z,x);   // w = x/x = 1
        b.store32(arg, b.bit_cast(w));
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        float vals[] = { 1,2,3,4,5,6,7,8,9 };
        p.eval(SK_ARRAY_COUNT(vals), vals);
        for (int i = 0; i < (int)SK_ARRAY_COUNT(vals); i++) {
            REPORTER_ASSERT(r, vals[i] == 1.0f);
        }
    });
}
468
DEF_TEST(SkVM_cmp_i32, r) {
    // Pack the result of six integer comparisons into bits 0-5 of the output.
    skvm::Builder b;
    {
        skvm::I32 x = b.load32(b.varying<int>());

        // Reduce a comparison mask to a single bit and move it to position `shift`.
        auto bit_at = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, bit_at(0, b. eq(x, b.splat(0))));
        bits = b.bit_or(bits, bit_at(1, b.neq(x, b.splat(1))));
        bits = b.bit_or(bits, bit_at(2, b. lt(x, b.splat(2))));
        bits = b.bit_or(bits, bit_at(3, b.lte(x, b.splat(3))));
        bits = b.bit_or(bits, bit_at(4, b. gt(x, b.splat(4))));
        bits = b.bit_or(bits, bit_at(5, b.gte(x, b.splat(5))));

        b.store32(b.varying<int>(), bits);
    }

    test_interpreter_only(r, b.done(), [&](const skvm::Program& p) {
        int in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        p.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0-4 each have their own pattern; 5 and up all match the last one.
        const int want[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010, 0b110010 };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            const int expected = want[i < 5 ? i : 5];
            REPORTER_ASSERT(r, out[i] == expected);
        }
    });
}
505
DEF_TEST(SkVM_cmp_f32, r) {
    // Pack the result of six float comparisons into bits 0-5 of the output.
    skvm::Builder b;
    {
        skvm::F32 x = b.bit_cast(b.load32(b.varying<float>()));

        // Reduce a comparison mask to a single bit and move it to position `shift`.
        auto bit_at = [&](int shift, skvm::I32 mask) {
            return b.shl(b.bit_and(mask, b.splat(0x1)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, bit_at(0, b. eq(x, b.splat(0.0f))));
        bits = b.bit_or(bits, bit_at(1, b.neq(x, b.splat(1.0f))));
        bits = b.bit_or(bits, bit_at(2, b. lt(x, b.splat(2.0f))));
        bits = b.bit_or(bits, bit_at(3, b.lte(x, b.splat(3.0f))));
        bits = b.bit_or(bits, bit_at(4, b. gt(x, b.splat(4.0f))));
        bits = b.bit_or(bits, bit_at(5, b.gte(x, b.splat(5.0f))));

        b.store32(b.varying<int>(), bits);
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        float in[] = { 0,1,2,3,4,5,6,7,8,9 };
        int out[SK_ARRAY_COUNT(in)];

        p.eval(SK_ARRAY_COUNT(in), in, out);

        // Inputs 0-4 each have their own pattern; 5 and up all match the last one.
        const int want[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010, 0b110010 };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(out); i++) {
            const int expected = want[i < 5 ? i : 5];
            REPORTER_ASSERT(r, out[i] == expected);
        }
    });
}
542
DEF_TEST(SkVM_i16x2, r) {
    // Exercise the _16x2 arithmetic and shift ops on a buffer of 16-bit values.
    skvm::Builder b;
    {
        skvm::Arg buf = b.varying<int>();

        skvm::I32 x = b.load32(buf),
                  y = b.add_16x2(x,x),   // y = 2x
                  z = b.mul_16x2(x,y),   // z = 2x^2
                  w = b.sub_16x2(z,x),   // w = x(2x-1)
                  v = b.shl_16x2(w,7),   // These shifts will be a no-op
                  u = b.sra_16x2(v,7);   // for all but x=12 and x=13.
        b.store32(buf, u);
    }

    test_interpreter_only(r, b.done(), [&](const skvm::Program& p) {
        uint16_t lanes[] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13 };

        // The program loads 32 bits at a time, so it runs over half as many elements.
        p.eval(SK_ARRAY_COUNT(lanes)/2, lanes);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(lanes); i++) {
            int want = i*(2*i-1);
            if (i == 12) { want = 0xff14; }  // 12*23 = 0x114
            if (i == 13) { want = 0xff45; }  // 13*25 = 0x145
            REPORTER_ASSERT(r, lanes[i] == want);
        }
    });
}
568
DEF_TEST(SkVM_cmp_i16, r) {
    // Pack the result of six _16x2 comparisons into bits 0-5 of each 16-bit half.
    skvm::Builder b;
    {
        skvm::Arg buf = b.varying<int>();
        skvm::I32 x = b.load32(buf);

        // Reduce a comparison mask to one bit per half and shift it into place.
        auto bit_at = [&](int shift, skvm::I32 mask) {
            return b.shl_16x2(b.bit_and(mask, b.splat(0x0001'0001)), shift);
        };

        skvm::I32 bits = b.splat(0);
        bits = b.bit_or(bits, bit_at(0, b. eq_16x2(x, b.splat(0x0000'0000))));
        bits = b.bit_or(bits, bit_at(1, b.neq_16x2(x, b.splat(0x0001'0001))));
        bits = b.bit_or(bits, bit_at(2, b. lt_16x2(x, b.splat(0x0002'0002))));
        bits = b.bit_or(bits, bit_at(3, b.lte_16x2(x, b.splat(0x0003'0003))));
        bits = b.bit_or(bits, bit_at(4, b. gt_16x2(x, b.splat(0x0004'0004))));
        bits = b.bit_or(bits, bit_at(5, b.gte_16x2(x, b.splat(0x0005'0005))));

        b.store32(buf, bits);
    }

    test_interpreter_only(r, b.done(), [&](const skvm::Program& p) {
        int16_t lanes[] = { 0,1, 2,3, 4,5, 6,7, 8,9 };

        // The program loads 32 bits at a time, so it runs over half as many elements.
        p.eval(SK_ARRAY_COUNT(lanes)/2, lanes);

        // Inputs 0-4 each have their own pattern; 5 and up all match the last one.
        const int want[] = { 0b001111, 0b001100, 0b001010, 0b001010, 0b000010, 0b110010 };
        for (int i = 0; i < (int)SK_ARRAY_COUNT(lanes); i++) {
            const int expected = want[i < 5 ? i : 5];
            REPORTER_ASSERT(r, lanes[i] == expected);
        }
    });
}
605
606
DEF_TEST(SkVM_mad, r) {
    // This program is designed to exercise the tricky corners of instruction
    // and register selection for Op::mad_f32.

    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<int>();

        skvm::F32 x = b.to_f32(b.load32(arg)),
                  y = b.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = b.mad(y,y,x),   // y is needed in the future, but r[z] = r[x] is ok.
                  w = b.mad(z,z,y),   // w can alias z but not y.
                  v = b.mad(w,y,w);   // Got to stop somewhere.
        b.store32(arg, b.trunc(v));
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        // Starting from x = 2:
        //   y = 2*2 + 2       = 6
        //   z = 6*6 + 2       = 38
        //   w = 38*38 + 6     = 1450
        //   v = 1450*6 + 1450 = 10150
        int value = 2;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 10150);
    });
}
634
DEF_TEST(SkVM_madder, r) {
    // A second chain of mad() ops with a different pattern of operand reuse.
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<float>();

        skvm::F32 x = b.bit_cast(b.load32(arg)),
                  y = b.mad(x,x,x),   // x is needed in the future, so r[x] != r[y].
                  z = b.mad(y,x,y),   // r[x] can be reused after this instruction, but not r[y].
                  w = b.mad(y,y,z);
        b.store32(arg, b.bit_cast(w));
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        // Starting from x = 2:
        //   y = 2*2 + 2  = 6
        //   z = 6*2 + 6  = 18
        //   w = 6*6 + 18 = 54
        float value = 2.0f;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 54.0f);
    });
}
656
DEF_TEST(SkVM_floor, r) {
    // Replace each float in the buffer with its floor.
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<float>();
        b.store32(arg, b.bit_cast(b.floor(b.bit_cast(b.load32(arg)))));
    }

#if defined(SK_CPU_X86)
    test_jit_and_interpreter
#else
    test_interpreter_only
#endif
    (r, b.done(), [&](const skvm::Program& p) {
        float       vals[] = { -2.0f, -1.5f, -1.0f, 0.0f, 1.0f, 1.5f, 2.0f };
        const float want[] = { -2.0f, -2.0f, -1.0f, 0.0f, 1.0f, 1.0f, 2.0f };

        const int count = (int)SK_ARRAY_COUNT(vals);
        p.eval(SK_ARRAY_COUNT(vals), vals);

        for (int i = 0; i < count; i++) {
            REPORTER_ASSERT(r, vals[i] == want[i]);
        }
    });
}
678
DEF_TEST(SkVM_hoist, r) {
    // This program uses enough constants that it will fail to JIT if we hoist them.
    // The JIT will try again without hoisting, and that'll just need 2 registers.
    skvm::Builder b;
    {
        skvm::Arg arg = b.varying<int>();
        skvm::I32 sum = b.load32(arg);

        // Add the constants 0 through 31, one add per constant.
        int k = 0;
        while (k < 32) {
            sum = b.add(sum, b.splat(k));
            k++;
        }
        b.store32(arg, sum);
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        // 0 + 1 + 2 + ... + 30 + 31 == 496, so 4 becomes 500.
        int value = 4;
        p.eval(1, &value);
        REPORTER_ASSERT(r, value == 500);
    });
}
700
DEF_TEST(SkVM_select, r) {
    // Keep values greater than 4; replace everything else with 42.
    skvm::Builder b;
    {
        skvm::Arg buf = b.varying<int>();

        skvm::I32 v = b.load32(buf);

        v = b.select( b.gt(v, b.splat(4)), v, b.splat(42) );

        b.store32(buf, v);
    }

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        int vals[] = { 0,1,2,3,4,5,6,7,8 };
        p.eval(SK_ARRAY_COUNT(vals), vals);

        for (int i = 0; i < (int)SK_ARRAY_COUNT(vals); i++) {
            int want = 42;
            if (i > 4) {
                want = i;
            }
            REPORTER_ASSERT(r, vals[i] == want);
        }
    });
}
721
DEF_TEST(SkVM_NewOps, r) {
    // Exercise a somewhat arbitrary set of new ops.
    skvm::Builder b;
    {
        skvm::Arg buf      = b.varying<int16_t>(),
                  uniforms = b.uniform();

        skvm::I32 x = b.load16(buf);

        // The uniforms struct below starts with a pointer; the remaining
        // fields are addressed relative to the end of that pointer.
        const size_t kPtr = sizeof(const int*);

        x = b.add(x, b.uniform32(uniforms, kPtr+0));
        x = b.mul(x, b.uniform8 (uniforms, kPtr+4));
        x = b.sub(x, b.uniform16(uniforms, kPtr+6));

        // Clamp x to [0, limit] before using it as a gather index.
        skvm::I32 limit = b.uniform32(uniforms, kPtr+8);
        x = b.select(b.lt(x, b.splat(0)), b.splat(0), x);
        x = b.select(b.gt(x, limit     ), limit     , x);

        x = b.gather8(uniforms,0, x);

        b.store16(buf, x);
    }

    // Flip this to true to print the builder and program dumps while debugging.
    if ((false)) {
        SkDynamicMemoryWStream buf;
        dump(b, &buf);
        sk_sp<SkData> blob = buf.detachAsData();
        // "%.*s" takes an int precision followed by a char pointer, so convert
        // the size_t size and untyped data pointer explicitly.
        SkDebugf("%.*s\n", (int)blob->size(), (const char*)blob->data());
    }

    test_interpreter_only(r, b.done(), [&](const skvm::Program& program) {
        const int N = 31;
        int16_t buf[N];
        for (int i = 0; i < N; i++) {
            buf[i] = i;
        }

        const int M = 16;
        uint8_t img[M];
        for (int i = 0; i < M; i++) {
            img[i] = i*i;
        }

        struct {
            const uint8_t* img;
            int      add   = 5;
            uint8_t  mul   = 3;
            uint16_t sub   = 18;
            int      limit = M-1;
        } uniforms{img};

        program.eval(N, buf, &uniforms);

        for (int i = 0; i < N; i++) {
            // Our first math calculates x = (i+5)*3 - 18 a.k.a 3*(i-1).
            int x = 3*(i-1);

            // Then that's pinned to the limits of img.
            if (i < 2) { x =  0; }  // Notice i == 1 hits x ==  0 exactly...
            if (i > 5) { x = 15; }  // ...and i == 6 hits x == 15 exactly
            REPORTER_ASSERT(r, buf[i] == img[x]);
        }
    });
}
787
DEF_TEST(SkVM_MSAN, r) {
    // This little memset32() program should be able to JIT, but if we run that
    // JIT code in an MSAN build, it won't see the writes initialize buf.  So
    // this tests that we're using the interpreter instead.
    skvm::Builder b;
    b.store32(b.varying<int>(), b.splat(42));

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        constexpr int kCount = 17;
        int filled[kCount];  // Intentionally uninitialized.

        p.eval(kCount, filled);
        sk_msan_assert_initialized(filled, filled+kCount);

        for (int i = 0; i < kCount; i++) {
            REPORTER_ASSERT(r, filled[i] == 42);
        }
    });
}
805
DEF_TEST(SkVM_assert, r) {
    // Build a program that asserts every loaded value is less than 42,
    // then run it over values for which that holds.
    skvm::Builder b;
    b.assert_true(b.lt(b.load32(b.varying<int>()),
                       b.splat(42)));

    test_jit_and_interpreter(r, b.done(), [&](const skvm::Program& p) {
        int vals[] = { 0,1,2,3,4,5,6,7,8,9 };
        p.eval(SK_ARRAY_COUNT(vals), vals);
    });
}
816
DEF_TEST(SkVM_premul, reporter) {
    // Test that premul is short-circuited when alpha is known opaque.

    // Premul with alpha loaded from memory.
    {
        skvm::Builder p;
        skvm::Arg rptr = p.varying<int>(),
                  aptr = p.varying<int>();

        skvm::F32 red   = p.bit_cast(p.load32(rptr)),
                  green = p.splat(0.0f),
                  blue  = p.splat(0.0f),
                  alpha = p.bit_cast(p.load32(aptr));

        p.premul(&red, &green, &blue, alpha);
        p.store32(rptr, p.bit_cast(red));

        // load red, load alpha, red *= alpha, store red
        const auto count = p.done().instructions().size();
        REPORTER_ASSERT(reporter, count == 4);
    }

    // Premul with a constant alpha of 1.
    {
        skvm::Builder p;
        skvm::Arg rptr = p.varying<int>();

        skvm::F32 red   = p.bit_cast(p.load32(rptr)),
                  green = p.splat(0.0f),
                  blue  = p.splat(0.0f),
                  alpha = p.splat(1.0f);

        p.premul(&red, &green, &blue, alpha);
        p.store32(rptr, p.bit_cast(red));

        // load red, store red
        const auto count = p.done().instructions().size();
        REPORTER_ASSERT(reporter, count == 2);
    }

    // Same deal for unpremul.

    // Unpremul with alpha loaded from memory.
    {
        skvm::Builder p;
        skvm::Arg rptr = p.varying<int>(),
                  aptr = p.varying<int>();

        skvm::F32 red   = p.bit_cast(p.load32(rptr)),
                  green = p.splat(0.0f),
                  blue  = p.splat(0.0f),
                  alpha = p.bit_cast(p.load32(aptr));

        p.unpremul(&red, &green, &blue, alpha);
        p.store32(rptr, p.bit_cast(red));

        // load red, load alpha, a bunch of unpremul instructions, store red
        const auto count = p.done().instructions().size();
        REPORTER_ASSERT(reporter, count >= 4);
    }

    // Unpremul with a constant alpha of 1.
    {
        skvm::Builder p;
        skvm::Arg rptr = p.varying<int>();

        skvm::F32 red   = p.bit_cast(p.load32(rptr)),
                  green = p.splat(0.0f),
                  blue  = p.splat(0.0f),
                  alpha = p.splat(1.0f);

        p.unpremul(&red, &green, &blue, alpha);
        p.store32(rptr, p.bit_cast(red));

        // load red, store red
        const auto count = p.done().instructions().size();
        REPORTER_ASSERT(reporter, count == 2);
    }
}
886
887 template <typename Fn>
test_asm(skiatest::Reporter * r,Fn && fn,std::initializer_list<uint8_t> expected)888 static void test_asm(skiatest::Reporter* r, Fn&& fn, std::initializer_list<uint8_t> expected) {
889 uint8_t buf[4096];
890 skvm::Assembler a{buf};
891 fn(a);
892
893 REPORTER_ASSERT(r, a.size() == expected.size());
894
895 auto got = (const uint8_t*)buf,
896 want = expected.begin();
897 for (int i = 0; i < (int)std::min(a.size(), expected.size()); i++) {
898 REPORTER_ASSERT(r, got[i] == want[i],
899 "byte %d was %02x, want %02x", i, got[i], want[i]);
900 }
901 }
902
DEF_TEST(SkVM_Assembler,r)903 DEF_TEST(SkVM_Assembler, r) {
904 // Easiest way to generate test cases is
905 //
906 // echo '...some asm...' | llvm-mc -show-encoding -x86-asm-syntax=intel
907 //
908 // The -x86-asm-syntax=intel bit is optional, controlling the
909 // input syntax only; the output will always be AT&T op x,y,dst style.
910 // Our APIs read more like Intel op dst,x,y as op(dst,x,y), so I find
911 // that a bit easier to use here, despite maybe favoring AT&T overall.
912
913 using A = skvm::Assembler;
914 // Our exit strategy from AVX code.
915 test_asm(r, [&](A& a) {
916 a.int3();
917 a.vzeroupper();
918 a.ret();
919 },{
920 0xcc,
921 0xc5, 0xf8, 0x77,
922 0xc3,
923 });
924
925 // Align should pad with zero
926 test_asm(r, [&](A& a) {
927 a.ret();
928 a.align(4);
929 },{
930 0xc3,
931 0x00, 0x00, 0x00,
932 });
933
934 test_asm(r, [&](A& a) {
935 a.add(A::rax, 8); // Always good to test rax.
936 a.sub(A::rax, 32);
937
938 a.add(A::rdi, 12); // Last 0x48 REX
939 a.sub(A::rdi, 8);
940
941 a.add(A::r8 , 7); // First 0x49 REX
942 a.sub(A::r8 , 4);
943
944 a.add(A::rsi, 128); // Requires 4 byte immediate.
945 a.sub(A::r8 , 1000000);
946 },{
947 0x48, 0x83, 0b11'000'000, 0x08,
948 0x48, 0x83, 0b11'101'000, 0x20,
949
950 0x48, 0x83, 0b11'000'111, 0x0c,
951 0x48, 0x83, 0b11'101'111, 0x08,
952
953 0x49, 0x83, 0b11'000'000, 0x07,
954 0x49, 0x83, 0b11'101'000, 0x04,
955
956 0x48, 0x81, 0b11'000'110, 0x80, 0x00, 0x00, 0x00,
957 0x49, 0x81, 0b11'101'000, 0x40, 0x42, 0x0f, 0x00,
958 });
959
960
961 test_asm(r, [&](A& a) {
962 a.vpaddd (A::ymm0, A::ymm1, A::ymm2); // Low registers and 0x0f map -> 2-byte VEX.
963 a.vpaddd (A::ymm8, A::ymm1, A::ymm2); // A high dst register is ok -> 2-byte VEX.
964 a.vpaddd (A::ymm0, A::ymm8, A::ymm2); // A high first argument register -> 2-byte VEX.
965 a.vpaddd (A::ymm0, A::ymm1, A::ymm8); // A high second argument -> 3-byte VEX.
966 a.vpmulld(A::ymm0, A::ymm1, A::ymm2); // Using non-0x0f map instruction -> 3-byte VEX.
967 a.vpsubd (A::ymm0, A::ymm1, A::ymm2); // Test vpsubd to ensure argument order is right.
968 },{
969 /* VEX */ /*op*/ /*modRM*/
970 0xc5, 0xf5, 0xfe, 0xc2,
971 0xc5, 0x75, 0xfe, 0xc2,
972 0xc5, 0xbd, 0xfe, 0xc2,
973 0xc4, 0xc1, 0x75, 0xfe, 0xc0,
974 0xc4, 0xe2, 0x75, 0x40, 0xc2,
975 0xc5, 0xf5, 0xfa, 0xc2,
976 });
977
978 test_asm(r, [&](A& a) {
979 a.vpcmpeqd (A::ymm0, A::ymm1, A::ymm2);
980 a.vpcmpgtd (A::ymm0, A::ymm1, A::ymm2);
981 a.vcmpeqps (A::ymm0, A::ymm1, A::ymm2);
982 a.vcmpltps (A::ymm0, A::ymm1, A::ymm2);
983 a.vcmpleps (A::ymm0, A::ymm1, A::ymm2);
984 a.vcmpneqps(A::ymm0, A::ymm1, A::ymm2);
985 },{
986 0xc5,0xf5,0x76,0xc2,
987 0xc5,0xf5,0x66,0xc2,
988 0xc5,0xf4,0xc2,0xc2,0x00,
989 0xc5,0xf4,0xc2,0xc2,0x01,
990 0xc5,0xf4,0xc2,0xc2,0x02,
991 0xc5,0xf4,0xc2,0xc2,0x04,
992 });
993
994 test_asm(r, [&](A& a) {
995 a.vminps(A::ymm0, A::ymm1, A::ymm2);
996 a.vmaxps(A::ymm0, A::ymm1, A::ymm2);
997 },{
998 0xc5,0xf4,0x5d,0xc2,
999 0xc5,0xf4,0x5f,0xc2,
1000 });
1001
1002 test_asm(r, [&](A& a) {
1003 a.vpblendvb(A::ymm0, A::ymm1, A::ymm2, A::ymm3);
1004 },{
1005 0xc4,0xe3,0x75, 0x4c, 0xc2, 0x30,
1006 });
1007
1008 test_asm(r, [&](A& a) {
1009 a.vpsrld(A::ymm15, A::ymm2, 8);
1010 a.vpsrld(A::ymm0 , A::ymm8, 5);
1011 },{
1012 0xc5, 0x85, 0x72,0xd2, 0x08,
1013 0xc4,0xc1,0x7d, 0x72,0xd0, 0x05,
1014 });
1015
1016 test_asm(r, [&](A& a) {
1017 a.vpermq(A::ymm1, A::ymm2, 5);
1018 },{
1019 0xc4,0xe3,0xfd, 0x00,0xca, 0x05,
1020 });
1021
1022 test_asm(r, [&](A& a) {
1023 a.vroundps(A::ymm1, A::ymm2, A::NEAREST);
1024 a.vroundps(A::ymm1, A::ymm2, A::FLOOR);
1025 a.vroundps(A::ymm1, A::ymm2, A::CEIL);
1026 a.vroundps(A::ymm1, A::ymm2, A::TRUNC);
1027 },{
1028 0xc4,0xe3,0x7d,0x08,0xca,0x00,
1029 0xc4,0xe3,0x7d,0x08,0xca,0x01,
1030 0xc4,0xe3,0x7d,0x08,0xca,0x02,
1031 0xc4,0xe3,0x7d,0x08,0xca,0x03,
1032 });
1033
1034 test_asm(r, [&](A& a) {
1035 A::Label l = a.here();
1036 a.byte(1);
1037 a.byte(2);
1038 a.byte(3);
1039 a.byte(4);
1040
1041 a.vbroadcastss(A::ymm0 , &l);
1042 a.vbroadcastss(A::ymm1 , &l);
1043 a.vbroadcastss(A::ymm8 , &l);
1044 a.vbroadcastss(A::ymm15, &l);
1045
1046 a.vpshufb(A::ymm4, A::ymm3, &l);
1047 a.vpaddd (A::ymm4, A::ymm3, &l);
1048 a.vpsubd (A::ymm4, A::ymm3, &l);
1049
1050 a.vptest(A::ymm4, &l);
1051
1052 a.vmulps (A::ymm4, A::ymm3, &l);
1053 },{
1054 0x01, 0x02, 0x03, 0x4,
1055
1056 /* VEX */ /*op*/ /* ModRM */ /* offset */
1057 0xc4, 0xe2, 0x7d, 0x18, 0b00'000'101, 0xf3,0xff,0xff,0xff, // 0xfffffff3 == -13
1058 0xc4, 0xe2, 0x7d, 0x18, 0b00'001'101, 0xea,0xff,0xff,0xff, // 0xffffffea == -22
1059 0xc4, 0x62, 0x7d, 0x18, 0b00'000'101, 0xe1,0xff,0xff,0xff, // 0xffffffe1 == -31
1060 0xc4, 0x62, 0x7d, 0x18, 0b00'111'101, 0xd8,0xff,0xff,0xff, // 0xffffffd8 == -40
1061
1062 0xc4, 0xe2, 0x65, 0x00, 0b00'100'101, 0xcf,0xff,0xff,0xff, // 0xffffffcf == -49
1063
1064 0xc5, 0xe5, 0xfe, 0b00'100'101, 0xc7,0xff,0xff,0xff, // 0xffffffc7 == -57
1065 0xc5, 0xe5, 0xfa, 0b00'100'101, 0xbf,0xff,0xff,0xff, // 0xffffffbf == -65
1066
1067 0xc4, 0xe2, 0x7d, 0x17, 0b00'100'101, 0xb6,0xff,0xff,0xff, // 0xffffffb6 == -74
1068
1069 0xc5, 0xe4, 0x59, 0b00'100'101, 0xae,0xff,0xff,0xff, // 0xffffffae == -82
1070 });
1071
1072 test_asm(r, [&](A& a) {
1073 a.vbroadcastss(A::ymm0, A::rdi, 0);
1074 a.vbroadcastss(A::ymm13, A::r14, 7);
1075 a.vbroadcastss(A::ymm8, A::rdx, -12);
1076 a.vbroadcastss(A::ymm8, A::rdx, 400);
1077
1078 a.vbroadcastss(A::ymm8, A::xmm0);
1079 a.vbroadcastss(A::ymm0, A::xmm13);
1080 },{
1081 /* VEX */ /*op*/ /*ModRM*/ /*offset*/
1082 0xc4,0xe2,0x7d, 0x18, 0b00'000'111,
1083 0xc4,0x42,0x7d, 0x18, 0b01'101'110, 0x07,
1084 0xc4,0x62,0x7d, 0x18, 0b01'000'010, 0xf4,
1085 0xc4,0x62,0x7d, 0x18, 0b10'000'010, 0x90,0x01,0x00,0x00,
1086
1087 0xc4,0x62,0x7d, 0x18, 0b11'000'000,
1088 0xc4,0xc2,0x7d, 0x18, 0b11'000'101,
1089 });
1090
1091 test_asm(r, [&](A& a) {
1092 A::Label l = a.here();
1093 a.jne(&l);
1094 a.jne(&l);
1095 a.je (&l);
1096 a.jmp(&l);
1097 a.jl (&l);
1098 a.jc (&l);
1099
1100 a.cmp(A::rdx, 0);
1101 a.cmp(A::rax, 12);
1102 a.cmp(A::r14, 2000000000);
1103 },{
1104 0x0f,0x85, 0xfa,0xff,0xff,0xff, // near jne -6 bytes
1105 0x0f,0x85, 0xf4,0xff,0xff,0xff, // near jne -12 bytes
1106 0x0f,0x84, 0xee,0xff,0xff,0xff, // near je -18 bytes
1107 0xe9, 0xe9,0xff,0xff,0xff, // near jmp -23 bytes
1108 0x0f,0x8c, 0xe3,0xff,0xff,0xff, // near jl -29 bytes
1109 0x0f,0x82, 0xdd,0xff,0xff,0xff, // near jc -35 bytes
1110
1111 0x48,0x83,0xfa,0x00,
1112 0x48,0x83,0xf8,0x0c,
1113 0x49,0x81,0xfe,0x00,0x94,0x35,0x77,
1114 });
1115
1116 test_asm(r, [&](A& a) {
1117 a.vmovups(A::ymm5, A::rsi);
1118 a.vmovups(A::rsi, A::ymm5);
1119
1120 a.vmovups(A::rsi, A::xmm5);
1121
1122 a.vpmovzxwd(A::ymm4, A::rsi);
1123 a.vpmovzxbd(A::ymm4, A::rsi);
1124
1125 a.vmovq(A::rdx, A::xmm15);
1126 },{
1127 /* VEX */ /*Op*/ /* ModRM */
1128 0xc5, 0xfc, 0x10, 0b00'101'110,
1129 0xc5, 0xfc, 0x11, 0b00'101'110,
1130
1131 0xc5, 0xf8, 0x11, 0b00'101'110,
1132
1133 0xc4,0xe2,0x7d, 0x33, 0b00'100'110,
1134 0xc4,0xe2,0x7d, 0x31, 0b00'100'110,
1135
1136 0xc5, 0x79, 0xd6, 0b00'111'010,
1137 });
1138
1139 test_asm(r, [&](A& a) {
1140 a.movzbl(A::rax, A::rsi, 0); // Low registers for src and dst.
1141 a.movzbl(A::rax, A::r8, 0); // High src register.
1142 a.movzbl(A::r8 , A::rsi, 0); // High dst register.
1143 a.movzbl(A::r8, A::rsi, 12);
1144 a.movzbl(A::r8, A::rsi, 400);
1145
1146 a.vmovd(A::rax, A::xmm0);
1147 a.vmovd(A::rax, A::xmm8);
1148 a.vmovd(A::r8, A::xmm0);
1149
1150 a.vmovd(A::xmm0, A::rax);
1151 a.vmovd(A::xmm8, A::rax);
1152 a.vmovd(A::xmm0, A::r8);
1153
1154 a.vmovd(A::xmm0 , A::FOUR, A::rcx, A::rax);
1155 a.vmovd(A::xmm15, A::TWO, A::r8, A::rax);
1156 a.vmovd(A::xmm0 , A::ONE, A::rcx, A::r8);
1157
1158 a.vmovd_direct(A::rax, A::xmm0);
1159 a.vmovd_direct(A::rax, A::xmm8);
1160 a.vmovd_direct(A::r8, A::xmm0);
1161
1162 a.vmovd_direct(A::xmm0, A::rax);
1163 a.vmovd_direct(A::xmm8, A::rax);
1164 a.vmovd_direct(A::xmm0, A::r8);
1165
1166 a.movb(A::rdx, A::rax);
1167 a.movb(A::rdx, A::r8);
1168 a.movb(A::r8 , A::rax);
1169 },{
1170 0x0f,0xb6,0x06,
1171 0x41,0x0f,0xb6,0x00,
1172 0x44,0x0f,0xb6,0x06,
1173 0x44,0x0f,0xb6,0x46, 12,
1174 0x44,0x0f,0xb6,0x86, 0x90,0x01,0x00,0x00,
1175
1176 0xc5,0xf9,0x7e,0x00,
1177 0xc5,0x79,0x7e,0x00,
1178 0xc4,0xc1,0x79,0x7e,0x00,
1179
1180 0xc5,0xf9,0x6e,0x00,
1181 0xc5,0x79,0x6e,0x00,
1182 0xc4,0xc1,0x79,0x6e,0x00,
1183
1184 0xc5,0xf9,0x6e,0x04,0x88,
1185 0xc4,0x21,0x79,0x6e,0x3c,0x40,
1186 0xc4,0xc1,0x79,0x6e,0x04,0x08,
1187
1188 0xc5,0xf9,0x7e,0xc0,
1189 0xc5,0x79,0x7e,0xc0,
1190 0xc4,0xc1,0x79,0x7e,0xc0,
1191
1192 0xc5,0xf9,0x6e,0xc0,
1193 0xc5,0x79,0x6e,0xc0,
1194 0xc4,0xc1,0x79,0x6e,0xc0,
1195
1196 0x88, 0x02,
1197 0x44, 0x88, 0x02,
1198 0x41, 0x88, 0x00,
1199 });
1200
1201 test_asm(r, [&](A& a) {
1202 a.vpinsrw(A::xmm1, A::xmm8, A::rsi, 4);
1203 a.vpinsrw(A::xmm8, A::xmm1, A::r8, 12);
1204
1205 a.vpinsrb(A::xmm1, A::xmm8, A::rsi, 4);
1206 a.vpinsrb(A::xmm8, A::xmm1, A::r8, 12);
1207
1208 a.vpextrw(A::rsi, A::xmm8, 7);
1209 a.vpextrw(A::r8, A::xmm1, 15);
1210
1211 a.vpextrb(A::rsi, A::xmm8, 7);
1212 a.vpextrb(A::r8, A::xmm1, 15);
1213 },{
1214 0xc5,0xb9, 0xc4, 0x0e, 4,
1215 0xc4,0x41,0x71, 0xc4, 0x00, 12,
1216
1217 0xc4,0xe3,0x39, 0x20, 0x0e, 4,
1218 0xc4,0x43,0x71, 0x20, 0x00, 12,
1219
1220 0xc4,0x63,0x79, 0x15, 0x06, 7,
1221 0xc4,0xc3,0x79, 0x15, 0x08, 15,
1222
1223 0xc4,0x63,0x79, 0x14, 0x06, 7,
1224 0xc4,0xc3,0x79, 0x14, 0x08, 15,
1225 });
1226
1227 test_asm(r, [&](A& a) {
1228 a.vpandn(A::ymm3, A::ymm12, A::ymm2);
1229 },{
1230 0xc5, 0x9d, 0xdf, 0xda,
1231 });
1232
1233 test_asm(r, [&](A& a) {
1234 a.vmovdqa (A::ymm3, A::ymm2);
1235 a.vcvttps2dq(A::ymm3, A::ymm2);
1236 a.vcvtdq2ps (A::ymm3, A::ymm2);
1237 a.vcvtps2dq (A::ymm3, A::ymm2);
1238 a.vsqrtps (A::ymm3, A::ymm2);
1239 },{
1240 0xc5,0xfd,0x6f,0xda,
1241 0xc5,0xfe,0x5b,0xda,
1242 0xc5,0xfc,0x5b,0xda,
1243 0xc5,0xfd,0x5b,0xda,
1244 0xc5,0xfc,0x51,0xda,
1245 });
1246
1247 test_asm(r, [&](A& a) {
1248 a.vgatherdps(A::ymm1 , A::FOUR , A::ymm0 , A::rdi, A::ymm2 );
1249 a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::rax, A::ymm1 );
1250 a.vgatherdps(A::ymm10, A::ONE , A::ymm2 , A::rax, A::ymm1 );
1251 a.vgatherdps(A::ymm0 , A::ONE , A::ymm12, A::rax, A::ymm1 );
1252 a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::r9 , A::ymm1 );
1253 a.vgatherdps(A::ymm0 , A::ONE , A::ymm2 , A::rax, A::ymm12);
1254 a.vgatherdps(A::ymm0 , A::EIGHT, A::ymm2 , A::rax, A::ymm12);
1255 },{
1256 0xc4,0xe2,0x6d,0x92,0x0c,0x87,
1257 0xc4,0xe2,0x75,0x92,0x04,0x10,
1258 0xc4,0x62,0x75,0x92,0x14,0x10,
1259 0xc4,0xa2,0x75,0x92,0x04,0x20,
1260 0xc4,0xc2,0x75,0x92,0x04,0x11,
1261 0xc4,0xe2,0x1d,0x92,0x04,0x10,
1262 0xc4,0xe2,0x1d,0x92,0x04,0xd0,
1263 });
1264
1265 test_asm(r, [&](A& a) {
1266 a.movq(A::rax, A::rdi, 0);
1267 a.movq(A::rax, A::rdi, 1);
1268 a.movq(A::rax, A::rdi, 512);
1269 a.movq(A::r15, A::r13, 42);
1270 a.movq(A::rax, A::r13, 42);
1271 a.movq(A::r15, A::rax, 42);
1272 },{
1273 0x48, 0x8b, 0x07,
1274 0x48, 0x8b, 0x47, 0x01,
1275 0x48, 0x8b, 0x87, 0x00,0x02,0x00,0x00,
1276 0x4d, 0x8b, 0x7d, 0x2a,
1277 0x49, 0x8b, 0x45, 0x2a,
1278 0x4c, 0x8b, 0x78, 0x2a,
1279 });
1280
1281 // echo "fmul v4.4s, v3.4s, v1.4s" | llvm-mc -show-encoding -arch arm64
1282
1283 test_asm(r, [&](A& a) {
1284 a.and16b(A::v4, A::v3, A::v1);
1285 a.orr16b(A::v4, A::v3, A::v1);
1286 a.eor16b(A::v4, A::v3, A::v1);
1287 a.bic16b(A::v4, A::v3, A::v1);
1288 a.bsl16b(A::v4, A::v3, A::v1);
1289 a.not16b(A::v4, A::v3);
1290
1291 a.add4s(A::v4, A::v3, A::v1);
1292 a.sub4s(A::v4, A::v3, A::v1);
1293 a.mul4s(A::v4, A::v3, A::v1);
1294
1295 a.cmeq4s(A::v4, A::v3, A::v1);
1296 a.cmgt4s(A::v4, A::v3, A::v1);
1297
1298 a.sub8h(A::v4, A::v3, A::v1);
1299 a.mul8h(A::v4, A::v3, A::v1);
1300
1301 a.fadd4s(A::v4, A::v3, A::v1);
1302 a.fsub4s(A::v4, A::v3, A::v1);
1303 a.fmul4s(A::v4, A::v3, A::v1);
1304 a.fdiv4s(A::v4, A::v3, A::v1);
1305 a.fmin4s(A::v4, A::v3, A::v1);
1306 a.fmax4s(A::v4, A::v3, A::v1);
1307
1308 a.fmla4s(A::v4, A::v3, A::v1);
1309 a.fmls4s(A::v4, A::v3, A::v1);
1310
1311 a.fcmeq4s(A::v4, A::v3, A::v1);
1312 a.fcmgt4s(A::v4, A::v3, A::v1);
1313 a.fcmge4s(A::v4, A::v3, A::v1);
1314 },{
1315 0x64,0x1c,0x21,0x4e,
1316 0x64,0x1c,0xa1,0x4e,
1317 0x64,0x1c,0x21,0x6e,
1318 0x64,0x1c,0x61,0x4e,
1319 0x64,0x1c,0x61,0x6e,
1320 0x64,0x58,0x20,0x6e,
1321
1322 0x64,0x84,0xa1,0x4e,
1323 0x64,0x84,0xa1,0x6e,
1324 0x64,0x9c,0xa1,0x4e,
1325
1326 0x64,0x8c,0xa1,0x6e,
1327 0x64,0x34,0xa1,0x4e,
1328
1329 0x64,0x84,0x61,0x6e,
1330 0x64,0x9c,0x61,0x4e,
1331
1332 0x64,0xd4,0x21,0x4e,
1333 0x64,0xd4,0xa1,0x4e,
1334 0x64,0xdc,0x21,0x6e,
1335 0x64,0xfc,0x21,0x6e,
1336 0x64,0xf4,0xa1,0x4e,
1337 0x64,0xf4,0x21,0x4e,
1338
1339 0x64,0xcc,0x21,0x4e,
1340 0x64,0xcc,0xa1,0x4e,
1341
1342 0x64,0xe4,0x21,0x4e,
1343 0x64,0xe4,0xa1,0x6e,
1344 0x64,0xe4,0x21,0x6e,
1345 });
1346
1347 test_asm(r, [&](A& a) {
1348 a.shl4s(A::v4, A::v3, 0);
1349 a.shl4s(A::v4, A::v3, 1);
1350 a.shl4s(A::v4, A::v3, 8);
1351 a.shl4s(A::v4, A::v3, 16);
1352 a.shl4s(A::v4, A::v3, 31);
1353
1354 a.sshr4s(A::v4, A::v3, 1);
1355 a.sshr4s(A::v4, A::v3, 8);
1356 a.sshr4s(A::v4, A::v3, 31);
1357
1358 a.ushr4s(A::v4, A::v3, 1);
1359 a.ushr4s(A::v4, A::v3, 8);
1360 a.ushr4s(A::v4, A::v3, 31);
1361
1362 a.ushr8h(A::v4, A::v3, 1);
1363 a.ushr8h(A::v4, A::v3, 8);
1364 a.ushr8h(A::v4, A::v3, 15);
1365 },{
1366 0x64,0x54,0x20,0x4f,
1367 0x64,0x54,0x21,0x4f,
1368 0x64,0x54,0x28,0x4f,
1369 0x64,0x54,0x30,0x4f,
1370 0x64,0x54,0x3f,0x4f,
1371
1372 0x64,0x04,0x3f,0x4f,
1373 0x64,0x04,0x38,0x4f,
1374 0x64,0x04,0x21,0x4f,
1375
1376 0x64,0x04,0x3f,0x6f,
1377 0x64,0x04,0x38,0x6f,
1378 0x64,0x04,0x21,0x6f,
1379
1380 0x64,0x04,0x1f,0x6f,
1381 0x64,0x04,0x18,0x6f,
1382 0x64,0x04,0x11,0x6f,
1383 });
1384
1385 test_asm(r, [&](A& a) {
1386 a.sli4s(A::v4, A::v3, 0);
1387 a.sli4s(A::v4, A::v3, 1);
1388 a.sli4s(A::v4, A::v3, 8);
1389 a.sli4s(A::v4, A::v3, 16);
1390 a.sli4s(A::v4, A::v3, 31);
1391 },{
1392 0x64,0x54,0x20,0x6f,
1393 0x64,0x54,0x21,0x6f,
1394 0x64,0x54,0x28,0x6f,
1395 0x64,0x54,0x30,0x6f,
1396 0x64,0x54,0x3f,0x6f,
1397 });
1398
1399 test_asm(r, [&](A& a) {
1400 a.scvtf4s (A::v4, A::v3);
1401 a.fcvtzs4s(A::v4, A::v3);
1402 a.fcvtns4s(A::v4, A::v3);
1403 },{
1404 0x64,0xd8,0x21,0x4e,
1405 0x64,0xb8,0xa1,0x4e,
1406 0x64,0xa8,0x21,0x4e,
1407 });
1408
1409 test_asm(r, [&](A& a) {
1410 a.brk(0);
1411 a.brk(65535);
1412
1413 a.ret(A::x30); // Conventional ret using link register.
1414 a.ret(A::x13); // Can really return using any register if we like.
1415
1416 a.add(A::x2, A::x2, 4);
1417 a.add(A::x3, A::x2, 32);
1418
1419 a.sub(A::x2, A::x2, 4);
1420 a.sub(A::x3, A::x2, 32);
1421
1422 a.subs(A::x2, A::x2, 4);
1423 a.subs(A::x3, A::x2, 32);
1424
1425 a.subs(A::xzr, A::x2, 4); // These are actually the same instruction!
1426 a.cmp(A::x2, 4);
1427
1428 A::Label l = a.here();
1429 a.bne(&l);
1430 a.bne(&l);
1431 a.blt(&l);
1432 a.b(&l);
1433 a.cbnz(A::x2, &l);
1434 a.cbz(A::x2, &l);
1435 },{
1436 0x00,0x00,0x20,0xd4,
1437 0xe0,0xff,0x3f,0xd4,
1438
1439 0xc0,0x03,0x5f,0xd6,
1440 0xa0,0x01,0x5f,0xd6,
1441
1442 0x42,0x10,0x00,0x91,
1443 0x43,0x80,0x00,0x91,
1444
1445 0x42,0x10,0x00,0xd1,
1446 0x43,0x80,0x00,0xd1,
1447
1448 0x42,0x10,0x00,0xf1,
1449 0x43,0x80,0x00,0xf1,
1450
1451 0x5f,0x10,0x00,0xf1,
1452 0x5f,0x10,0x00,0xf1,
1453
1454 0x01,0x00,0x00,0x54, // b.ne #0
1455 0xe1,0xff,0xff,0x54, // b.ne #-4
1456 0xcb,0xff,0xff,0x54, // b.lt #-8
1457 0xae,0xff,0xff,0x54, // b.al #-12
1458 0x82,0xff,0xff,0xb5, // cbnz x2, #-16
1459 0x62,0xff,0xff,0xb4, // cbz x2, #-20
1460 });
1461
1462 // Can we cbz() to a not-yet-defined label?
1463 test_asm(r, [&](A& a) {
1464 A::Label l;
1465 a.cbz(A::x2, &l);
1466 a.add(A::x3, A::x2, 32);
1467 a.label(&l);
1468 a.ret(A::x30);
1469 },{
1470 0x42,0x00,0x00,0xb4, // cbz x2, #8
1471 0x43,0x80,0x00,0x91, // add x3, x2, #32
1472 0xc0,0x03,0x5f,0xd6, // ret
1473 });
1474
1475 // If we start a label as a backward label,
1476 // can we redefine it to be a future label?
1477 // (Not sure this is useful... just want to test it works.)
1478 test_asm(r, [&](A& a) {
1479 A::Label l1 = a.here();
1480 a.add(A::x3, A::x2, 32);
1481 a.cbz(A::x2, &l1); // This will jump backward... nothing sneaky.
1482
1483 A::Label l2 = a.here(); // Start off the same...
1484 a.add(A::x3, A::x2, 32);
1485 a.cbz(A::x2, &l2); // Looks like this will go backward...
1486 a.add(A::x2, A::x2, 4);
1487 a.add(A::x3, A::x2, 32);
1488 a.label(&l2); // But no... actually forward! What a switcheroo!
1489 },{
1490 0x43,0x80,0x00,0x91, // add x3, x2, #32
1491 0xe2,0xff,0xff,0xb4, // cbz x2, #-4
1492
1493 0x43,0x80,0x00,0x91, // add x3, x2, #32
1494 0x62,0x00,0x00,0xb4, // cbz x2, #12
1495 0x42,0x10,0x00,0x91, // add x2, x2, #4
1496 0x43,0x80,0x00,0x91, // add x3, x2, #32
1497 });
1498
1499 // Loading from a label on ARM.
1500 test_asm(r, [&](A& a) {
1501 A::Label fore,aft;
1502 a.label(&fore);
1503 a.word(0x01234567);
1504 a.ldrq(A::v1, &fore);
1505 a.ldrq(A::v2, &aft);
1506 a.label(&aft);
1507 a.word(0x76543210);
1508 },{
1509 0x67,0x45,0x23,0x01,
1510 0xe1,0xff,0xff,0x9c, // ldr q1, #-4
1511 0x22,0x00,0x00,0x9c, // ldr q2, #4
1512 0x10,0x32,0x54,0x76,
1513 });
1514
1515 test_asm(r, [&](A& a) {
1516 a.ldrq(A::v0, A::x8);
1517 a.strq(A::v0, A::x8);
1518 },{
1519 0x00,0x01,0xc0,0x3d,
1520 0x00,0x01,0x80,0x3d,
1521 });
1522
1523 test_asm(r, [&](A& a) {
1524 a.xtns2h(A::v0, A::v0);
1525 a.xtnh2b(A::v0, A::v0);
1526 a.strs (A::v0, A::x0);
1527
1528 a.ldrs (A::v0, A::x0);
1529 a.uxtlb2h(A::v0, A::v0);
1530 a.uxtlh2s(A::v0, A::v0);
1531
1532 a.uminv4s(A::v3, A::v4);
1533 a.fmovs (A::x3, A::v4); // fmov w3,s4
1534 },{
1535 0x00,0x28,0x61,0x0e,
1536 0x00,0x28,0x21,0x0e,
1537 0x00,0x00,0x00,0xbd,
1538
1539 0x00,0x00,0x40,0xbd,
1540 0x00,0xa4,0x08,0x2f,
1541 0x00,0xa4,0x10,0x2f,
1542
1543 0x83,0xa8,0xb1,0x6e,
1544 0x83,0x00,0x26,0x1e,
1545 });
1546
1547 test_asm(r, [&](A& a) {
1548 a.ldrb(A::v0, A::x8);
1549 a.strb(A::v0, A::x8);
1550 },{
1551 0x00,0x01,0x40,0x3d,
1552 0x00,0x01,0x00,0x3d,
1553 });
1554
1555 test_asm(r, [&](A& a) {
1556 a.tbl(A::v0, A::v1, A::v2);
1557 },{
1558 0x20,0x00,0x02,0x4e,
1559 });
1560 }
1561