1
2 /* A program to test SSE4.1/SSE4.2 instructions.
   Revisions:  Nov.2008 - wrote this file
4 Apr.10.2010 - added PEXTR* tests
5 Apr.16.2010 - added PINS* tests
6 */
7
8 /* HOW TO COMPILE:
9 gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c
10 */
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <assert.h>
15 //#include "tests/malloc.h" // reenable when reintegrated
16 #include <string.h>
17
18
19
20 // rmme when reintegrated
21 // Allocates a 16-aligned block. Asserts if the allocation fails.
22 #ifdef VGO_darwin
23 #include <stdlib.h>
24 #else
25 #include <malloc.h>
26 #endif
/* Returns a freshly allocated block of szB bytes whose address is a
   multiple of 16.  Aborts via assert() if the allocation fails or if
   the returned address is not 16-aligned.  Release with free(). */
__attribute__((unused))
static void* memalign16(size_t szB)
{
#if defined(VGO_darwin)
   // Darwin lacks memalign, but its malloc is always 16-aligned anyway.
   void* block = malloc(szB);
#else
   void* block = memalign(16, szB);
#endif
   assert(block);
   assert(((unsigned long)block & (16-1)) == 0);
   return block;
}
41
42
43
/* Basic types used throughout this file. */
typedef  unsigned char           V128[16];   /* one vector, held as 16 raw bytes */
typedef  unsigned int            UInt;
typedef  signed int              Int;
typedef  unsigned char           UChar;
typedef  unsigned long long int  ULong;

/* Minimal boolean type together with its two values. */
typedef  unsigned char  Bool;
#define False ((Bool)0)
#define True  ((Bool)1)
53
54
/* Two vector arguments plus a result vector.  The DO_* macros in this
   file copy the source operand into arg1, the destination's value
   before the instruction into arg2, and the value the instruction
   produced into res, then hand the struct to showIAA/showAA. */
typedef
   struct {
      V128 arg1;
      V128 arg2;
      V128 res;
   }
   RRArgs;
62
/* One vector argument plus a result vector. */
typedef
   struct {
      V128 arg1;
      V128 res;
   }
   RMArgs;
69
/* Builds a V128 from two 64-bit halves: wLo is copied into bytes 0..7
   of *res and wHi into bytes 8..15.  The copy goes through memcpy
   rather than a pointer cast to stay clear of strict-aliasing
   problems. */
static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo )
{
   UChar* bytes = (UChar*)res;
   memcpy(bytes,     &wLo, 8);
   memcpy(bytes + 8, &wHi, 8);
}
77
randUChar(void)78 static UChar randUChar ( void )
79 {
80 static UInt seed = 80021;
81 seed = 1103515245 * seed + 12345;
82 return (seed >> 17) & 0xFF;
83 }
84
randULong(void)85 static ULong randULong ( void )
86 {
87 Int i;
88 ULong r = 0;
89 for (i = 0; i < 8; i++) {
90 r = (r << 8) | (ULong)(0xFF & randUChar());
91 }
92 return r;
93 }
94
/* Fills all 16 bytes of *v with randUChar() output, byte 0 first. */
static void randV128 ( V128* v )
{
   UChar* cursor = &(*v)[0];
   UChar* limit  = cursor + 16;
   while (cursor != limit) {
      *cursor = randUChar();
      cursor++;
   }
}
101
/* Prints *v as 32 lowercase hex digits, byte 15 first and byte 0 last,
   with no trailing separator or newline. */
static void showV128 ( V128* v )
{
   Int remaining = 16;
   while (remaining > 0) {
      remaining--;
      printf("%02x", (Int)(*v)[remaining]);
   }
}
108
/* Like showV128, but each byte of *v is ANDed with the corresponding
   byte of *mask before being printed. */
static void showMaskedV128 ( V128* v, V128* mask )
{
   Int remaining;
   for (remaining = 16; remaining > 0; remaining--) {
      Int idx     = remaining - 1;
      Int visible = (*v)[idx] & (*mask)[idx];
      printf("%02x", visible);
   }
}
115
/* Prints one result line: the rOrM tag, the op name right-aligned in
   10 columns, the immediate, the 64-bit scalar source as 16 hex
   digits, then the dst and res vectors, separated by single spaces. */
static void showIGVV( char* rOrM, char* op, Int imm,
                      ULong src64, V128* dst, V128* res )
{
   printf("%s %10s $%d %016llx ", rOrM, op, imm, src64);
   showV128(dst);
   putchar(' ');
   showV128(res);
   putchar('\n');
}
127
/* Prints one result line: the rOrM tag, the op name right-aligned in
   10 columns, the immediate, the vector argL, then the 64-bit values
   argR and res as 16 hex digits each. */
static void showIAG ( char* rOrM, char* op, Int imm,
                      V128* argL, ULong argR, ULong res )
{
   printf("%s %10s $%d ", rOrM, op, imm);
   showV128(argL);
   printf(" %016llx %016llx\n", argR, res);
}
139
/* Prints one result line for an instruction with an immediate: tag,
   op name, immediate, both arguments from *rra in full, and the
   result from *rra filtered through *rmask. */
static void showIAA ( char* rOrM, char* op, Int imm, RRArgs* rra, V128* rmask )
{
   printf("%s %10s $%d ", rOrM, op, imm);
   showV128(&rra->arg1);
   putchar(' ');
   showV128(&rra->arg2);
   putchar(' ');
   showMaskedV128(&rra->res, rmask);
   putchar('\n');
}
150
/* Prints one result line for an instruction without an immediate:
   tag, op name, both arguments from *rra in full, and the result from
   *rra filtered through *rmask. */
static void showAA ( char* rOrM, char* op, RRArgs* rra, V128* rmask )
{
   printf("%s %10s ", rOrM, op);
   showV128(&rra->arg1);
   putchar(' ');
   showV128(&rra->arg2);
   putchar(' ');
   showMaskedV128(&rra->res, rmask);
   putchar('\n');
}
161
/* Note: these are little endian.  Hence first byte is the least
   significant byte of lane zero. */

/* Mask for insns where all result bits are non-approximated: every
   bit of the result is kept when printed via showMaskedV128. */
static V128 AllMask  = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
                         0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };

/* Mask for insns which produce approximated vector short results.
   Read little-endian, each 32-bit lane of the mask is 0xFF800000, so
   only the top 9 bits of every lane are kept. */
__attribute__((unused))
static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF,
                         0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF };

/* Mask for insns which produce approximated scalar short results.
   Lane zero is masked with 0xFF800000 as above; the remaining twelve
   bytes are kept in full. */
__attribute__((unused))
static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF,
                         0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };

/* Vector with every byte set to 0x55; DO_imm_rscalar_to_r and
   DO_imm_mscalar_to_r use it as the destination's initial value. */
static V128 fives    = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55,
                         0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 };

/* All-zero vector; the same two macros use it to pre-clear the buffer
   that receives the result. */
static V128 zeroes   = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
                         0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
184
/* Returns the value of 1.0 / 0.0, i.e. positive infinity under IEEE
   arithmetic. */
double mkPosInf ( void )
{
   double inf = 1.0 / 0.0;
   return inf;
}
/* Returns the negation of mkPosInf(). */
double mkNegInf ( void )
{
   double posInf = mkPosInf();
   return -posInf;
}
/* Returns the NaN that results from evaluating 0.0 / 0.0. */
double mkPosNan ( void )
{
   double nan = 0.0 / 0.0;
   return nan;
}
/* Returns the negation of mkPosNan(). */
double mkNegNan ( void )
{
   double posNan = mkPosNan();
   return -posNan;
}
189
190 __attribute__((noinline))
get_mxcsr(void)191 UInt get_mxcsr ( void )
192 {
193 ULong w64;
194 __asm__ __volatile__(
195 "subq $8, %%rsp" "\n\t"
196 "stmxcsr (%%rsp)" "\n\t"
197 "movq (%%rsp), %0" "\n"
198 "addq $8, %%rsp"
199 : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc"
200 );
201 if (0) printf("get %08x\n", (UInt)w64);
202 return (UInt)w64;
203 }
204
205 __attribute__((noinline))
set_mxcsr(UInt w32)206 void set_mxcsr ( UInt w32 )
207 {
208 if (0) printf("set %08x\n", w32);
209 ULong w64 = (ULong)w32;
210 __asm__ __volatile__(
211 "subq $8, %%rsp" "\n\t"
212 "movq %0, (%%rsp)" "\n\t"
213 "ldmxcsr (%%rsp)" "\n\t"
214 "addq $8, %%rsp"
215 : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc"
216 );
217 }
218
get_sse_roundingmode(void)219 UInt get_sse_roundingmode ( void )
220 {
221 UInt w = get_mxcsr();
222 return (w >> 13) & 3;
223 }
224
/* Replaces bits 13..14 of MXCSR with m, leaving every other bit as it
   was.  m must be in the range 0..3; anything else trips the assert. */
void set_sse_roundingmode ( UInt m )
{
   UInt csr;
   assert((m & ~3) == 0);
   csr = get_mxcsr();
   csr = (csr & ~(3 << 13)) | (m << 13);
   set_mxcsr(csr);
}
234
235
/* Register-source test of a two-operand vector insn taking an
   immediate.  Loads xmm2 from _src and xmm11 from _dst, executes
   "_opname $_imm, %xmm2, %xmm11", stores xmm11 into a temporary, and
   prints the operands and result through showIAA with the "r" tag and
   AllMask.

   _opname must be a string literal and _imm a literal token, because
   both are pasted into the asm template (_imm via stringification).
   _src and _dst must be V128 lvalues.  The expansion is a bare { }
   block that declares locals named _tmp and rra. */
#define DO_imm_r_r(_opname, _imm, _src, _dst)  \
   {  \
      V128 _tmp;  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " $" #_imm ", %%xmm2, %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp))  \
         : "cc", "memory", "xmm2", "xmm11"                            \
      );  \
      RRArgs rra;  \
      memcpy(&rra.arg1, &(_src), sizeof(V128));  \
      memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
      memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
      showIAA("r", (_opname), (_imm), &rra, &AllMask);  \
   }
253
/* Memory-source variant of DO_imm_r_r.  Copies _src into a 16-aligned
   heap block obtained from memalign16, loads xmm11 from _dst, executes
   "_opname $_imm, (mem), %xmm11" with that block as the memory
   operand, stores xmm11 into a temporary, prints through showIAA with
   the "m" tag, and frees the block.

   Argument requirements are the same as for DO_imm_r_r.  The
   expansion is a bare { } block that declares locals named _tmp,
   _srcM and rra. */
#define DO_imm_m_r(_opname, _imm, _src, _dst)  \
   {  \
      V128  _tmp;  \
      V128* _srcM = memalign16(sizeof(V128));  \
      memcpy(_srcM, &(_src), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " $" #_imm ", (%0), %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp))  \
         : "cc", "memory", "xmm11"  \
      );  \
      RRArgs rra;  \
      memcpy(&rra.arg1, &(_src), sizeof(V128));  \
      memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
      memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
      showIAA("m", (_opname), (_imm), &rra, &AllMask);  \
      free(_srcM);  \
   }
273
/* Runs the register-source form and then the memory-source form of
   the same insn on the same operands.  The two expansions are placed
   back to back with no separator, which works because each one is a
   bare { } block. */
#define DO_imm_mandr_r(_opname, _imm, _src, _dst)  \
      DO_imm_r_r( _opname, _imm, _src, _dst )  \
      DO_imm_m_r( _opname, _imm, _src, _dst )
277
278
279
280
281
/* Register-source test of a two-operand vector insn without an
   immediate.  Loads xmm2 from _src and xmm11 from _dst, executes
   "_opname %xmm2, %xmm11", stores xmm11 into a temporary, and prints
   the operands and result through showAA with the "r" tag and AllMask.

   _opname must be a string literal (it is pasted into the asm
   template); _src and _dst must be V128 lvalues.  The expansion is a
   bare { } block that declares locals named _tmp and rra. */
#define DO_r_r(_opname, _src, _dst)  \
   {  \
      V128 _tmp;  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " %%xmm2, %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp))  \
         : "cc", "memory", "xmm2", "xmm11"  \
      );  \
      RRArgs rra;  \
      memcpy(&rra.arg1, &(_src), sizeof(V128));  \
      memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
      memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
      showAA("r", (_opname), &rra, &AllMask);  \
   }
299
/* Memory-source variant of DO_r_r.  Copies _src into a 16-aligned heap
   block obtained from memalign16, loads xmm11 from _dst, executes
   "_opname (mem), %xmm11" with that block as the memory operand,
   stores xmm11 into a temporary, prints through showAA with the "m"
   tag, and frees the block.

   Argument requirements are the same as for DO_r_r.  The expansion is
   a bare { } block that declares locals named _tmp, _srcM and rra. */
#define DO_m_r(_opname, _src, _dst)  \
   {  \
      V128  _tmp;  \
      V128* _srcM = memalign16(sizeof(V128));  \
      memcpy(_srcM, &(_src), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " (%0), %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp))  \
         : "cc", "memory", "xmm11"  \
      );  \
      RRArgs rra;  \
      memcpy(&rra.arg1, &(_src), sizeof(V128));  \
      memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
      memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
      showAA("m", (_opname), &rra, &AllMask);  \
      free(_srcM);  \
   }
319
/* Runs the register-source form and then the memory-source form of
   the same insn on the same operands.  Relies on both expansions being
   bare { } blocks, since they are placed back to back. */
#define DO_mandr_r(_opname, _src, _dst)  \
      DO_r_r(_opname, _src, _dst)  \
      DO_m_r(_opname, _src, _dst)
323
324
325
326
/* Tests an insn that takes an immediate and a vector register source
   and writes an integer register.  Loads xmm2 from _src, preloads r11
   with 0x5555555555555555, executes "_opname $_imm, %xmm2, %r11" with
   the string literal _dstsuffix appended directly to the register
   name, stores r11 back to memory, and prints via showIAG with the
   "r" tag: the source vector, the preload value, and the final r11.

   _opname and _dstsuffix must be string literals and _imm a literal
   token; _src must be a V128 lvalue.  The expansion is a bare { }
   block that declares locals named _scbefore and _scafter. */
#define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix)       \
   {  \
      ULong _scbefore = 0x5555555555555555ULL;  \
      ULong _scafter  = 0xAAAAAAAAAAAAAAAAULL; \
      /* This assumes that gcc won't make any of %0, %1, %2 */ \
      /* be r11.  That should be ensured (cough, cough) */ \
      /* by declaring r11 to be clobbered. */ \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         "movq   (%1), %%r11"   "\n\t"  \
         _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix  "\n\t"  \
         "movq   %%r11, (%2)" "\n"  \
         : /*out*/ \
         : /*in*/ "r"(&(_src)), "r"(&(_scbefore)), "r"(&(_scafter))  \
         : "cc", "memory", "xmm2", "r11"  \
      );  \
      showIAG("r", (_opname), (_imm), &(_src), (_scbefore), (_scafter));  \
   }
345
/* Memory-destination counterpart of DO_imm_r_to_rscalar.  Loads xmm2
   from _src and executes "_opname $_imm, %xmm2, (mem)", where mem is a
   64-bit local that starts out as 0x5555555555555555.  Prints via
   showIAG with the "m" tag: the source vector, the initial value, and
   the local's contents after the insn.

   _opname must be a string literal and _imm a literal token; _src
   must be a V128 lvalue.  The expansion is a bare { } block that
   declares locals named _scbefore and _scafter. */
#define DO_imm_r_to_mscalar(_opname, _imm, _src)   \
   {  \
      ULong _scbefore = 0x5555555555555555ULL;  \
      ULong _scafter = _scbefore; \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         _opname " $" #_imm ", %%xmm2, (%1)"  "\n\t"  \
         : /*out*/ \
         : /*in*/ "r"(&(_src)), "r"(&(_scafter))  \
         : "cc", "memory", "xmm2"  \
      );  \
      showIAG("m", (_opname), (_imm), &(_src), (_scbefore), (_scafter));  \
   }
359
/* Runs the register-destination form and then the memory-destination
   form of the same insn.  _dstsuffix is used by the register form
   only.  Relies on both expansions being bare { } blocks, since they
   are placed back to back. */
#define DO_imm_r_to_mandrscalar(_opname, _imm, _src, _dstsuffix)  \
      DO_imm_r_to_rscalar( _opname, _imm, _src, _dstsuffix )      \
      DO_imm_r_to_mscalar( _opname, _imm, _src )
363
364
365
366
367
368
369
370
/* Tests an insn that takes an immediate and an integer register
   source and writes a vector register.  The destination xmm2 is
   preloaded with the 'fives' pattern and r11 with (ULong)_src; the
   macro then executes "_opname $_imm, %r11, %xmm2" with the string
   literal _srcsuffix appended directly to the register name, stores
   xmm2 into a buffer pre-cleared from 'zeroes', and prints via
   showIGVV with the "r" tag.

   _opname and _srcsuffix must be string literals and _imm a literal
   token.  The expansion is a bare { } block whose locals are named
   dstv, res and src64; NOTE(review): a _src expression that mentions
   any of those names would presumably be captured by the locals. */
#define DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix)       \
   {  \
      V128  dstv;         \
      V128  res;          \
      ULong src64 = (ULong)(_src); \
      memcpy(dstv, fives, sizeof(dstv)); \
      memcpy(res,  zeroes, sizeof(res)); \
      /* This assumes that gcc won't make any of %0, %1, %2 */ \
      /* be r11.  That should be ensured (cough, cough) */ \
      /* by declaring r11 to be clobbered. */ \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"   /*dstv*/   \
         "movq   (%1), %%r11"     "\n\t"   /*src64*/  \
         _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2"   "\n\t"  \
         "movupd  %%xmm2, (%2)" "\n" /*res*/                          \
         : /*out*/ \
         : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res)  \
         : "cc", "memory", "xmm2", "r11"  \
      );  \
      showIGVV("r", (_opname), (_imm), src64, &dstv, &res); \
   }
/* Memory-source counterpart of DO_imm_rscalar_to_r.  xmm2 is preloaded
   with the 'fives' pattern; the macro executes
   "_opname $_imm, (mem), %xmm2", where mem is a 64-bit local holding
   (ULong)_src, stores xmm2 into a buffer pre-cleared from 'zeroes',
   and prints via showIGVV with the "m" tag.

   _opname must be a string literal and _imm a literal token.  The
   expansion is a bare { } block whose locals are named dstv, res and
   src64. */
#define DO_imm_mscalar_to_r(_opname, _imm, _src)       \
   {  \
      V128  dstv;         \
      V128  res;          \
      ULong src64 = (ULong)(_src); \
      memcpy(dstv, fives, sizeof(dstv)); \
      memcpy(res,  zeroes, sizeof(res)); \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"   /*dstv*/   \
         _opname " $" #_imm ", (%1), %%xmm2"   "\n\t"  \
         "movupd  %%xmm2, (%2)" "\n" /*res*/                          \
         : /*out*/ \
         : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res)  \
         : "cc", "memory", "xmm2"  \
      );  \
      showIGVV("m", (_opname), (_imm), src64, &dstv, &res); \
   }
409
/* Runs the register-source form and then the memory-source form of
   the same scalar-to-vector insn.  Although the fourth parameter is
   named _dstsuffix here, it is forwarded as the source-register suffix
   of DO_imm_rscalar_to_r.  Relies on both expansions being bare { }
   blocks, since they are placed back to back. */
#define DO_imm_mandrscalar_to_r(_opname, _imm, _src, _dstsuffix )  \
      DO_imm_rscalar_to_r( _opname, _imm, _src, _dstsuffix )       \
      DO_imm_mscalar_to_r( _opname, _imm, _src )
413
414
415
416
417
test_BLENDPD(void)418 void test_BLENDPD ( void )
419 {
420 V128 src, dst;
421 Int i;
422 for (i = 0; i < 10; i++) {
423 randV128(&src);
424 randV128(&dst);
425 DO_imm_mandr_r("blendpd", 0, src, dst);
426 DO_imm_mandr_r("blendpd", 1, src, dst);
427 DO_imm_mandr_r("blendpd", 2, src, dst);
428 DO_imm_mandr_r("blendpd", 3, src, dst);
429 }
430 }
431
test_BLENDPS(void)432 void test_BLENDPS ( void )
433 {
434 V128 src, dst;
435 Int i;
436 for (i = 0; i < 10; i++) {
437 randV128(&src);
438 randV128(&dst);
439 DO_imm_mandr_r("blendps", 0, src, dst);
440 DO_imm_mandr_r("blendps", 1, src, dst);
441 DO_imm_mandr_r("blendps", 2, src, dst);
442 DO_imm_mandr_r("blendps", 3, src, dst);
443 DO_imm_mandr_r("blendps", 4, src, dst);
444 DO_imm_mandr_r("blendps", 5, src, dst);
445 DO_imm_mandr_r("blendps", 6, src, dst);
446 DO_imm_mandr_r("blendps", 7, src, dst);
447 DO_imm_mandr_r("blendps", 8, src, dst);
448 DO_imm_mandr_r("blendps", 9, src, dst);
449 DO_imm_mandr_r("blendps", 10, src, dst);
450 DO_imm_mandr_r("blendps", 11, src, dst);
451 DO_imm_mandr_r("blendps", 12, src, dst);
452 DO_imm_mandr_r("blendps", 13, src, dst);
453 DO_imm_mandr_r("blendps", 14, src, dst);
454 DO_imm_mandr_r("blendps", 15, src, dst);
455 }
456 }
457
test_DPPD(void)458 void test_DPPD ( void )
459 {
460 V128 src, dst;
461 {
462 *(double*)(&src[0]) = 1.2345;
463 *(double*)(&src[8]) = -6.78910;
464 *(double*)(&dst[0]) = -11.121314;
465 *(double*)(&dst[8]) = 15.161718;
466 DO_imm_mandr_r("dppd", 0, src, dst);
467 DO_imm_mandr_r("dppd", 1, src, dst);
468 DO_imm_mandr_r("dppd", 2, src, dst);
469 DO_imm_mandr_r("dppd", 3, src, dst);
470 DO_imm_mandr_r("dppd", 4, src, dst);
471 DO_imm_mandr_r("dppd", 5, src, dst);
472 DO_imm_mandr_r("dppd", 6, src, dst);
473 DO_imm_mandr_r("dppd", 7, src, dst);
474 DO_imm_mandr_r("dppd", 8, src, dst);
475 DO_imm_mandr_r("dppd", 9, src, dst);
476 DO_imm_mandr_r("dppd", 10, src, dst);
477 DO_imm_mandr_r("dppd", 11, src, dst);
478 DO_imm_mandr_r("dppd", 12, src, dst);
479 DO_imm_mandr_r("dppd", 13, src, dst);
480 DO_imm_mandr_r("dppd", 14, src, dst);
481 DO_imm_mandr_r("dppd", 15, src, dst);
482 DO_imm_mandr_r("dppd", 16, src, dst);
483 DO_imm_mandr_r("dppd", 17, src, dst);
484 DO_imm_mandr_r("dppd", 18, src, dst);
485 DO_imm_mandr_r("dppd", 19, src, dst);
486 DO_imm_mandr_r("dppd", 20, src, dst);
487 DO_imm_mandr_r("dppd", 21, src, dst);
488 DO_imm_mandr_r("dppd", 22, src, dst);
489 DO_imm_mandr_r("dppd", 23, src, dst);
490 DO_imm_mandr_r("dppd", 24, src, dst);
491 DO_imm_mandr_r("dppd", 25, src, dst);
492 DO_imm_mandr_r("dppd", 26, src, dst);
493 DO_imm_mandr_r("dppd", 27, src, dst);
494 DO_imm_mandr_r("dppd", 28, src, dst);
495 DO_imm_mandr_r("dppd", 29, src, dst);
496 DO_imm_mandr_r("dppd", 30, src, dst);
497 DO_imm_mandr_r("dppd", 31, src, dst);
498 DO_imm_mandr_r("dppd", 32, src, dst);
499 DO_imm_mandr_r("dppd", 33, src, dst);
500 DO_imm_mandr_r("dppd", 34, src, dst);
501 DO_imm_mandr_r("dppd", 35, src, dst);
502 DO_imm_mandr_r("dppd", 36, src, dst);
503 DO_imm_mandr_r("dppd", 37, src, dst);
504 DO_imm_mandr_r("dppd", 38, src, dst);
505 DO_imm_mandr_r("dppd", 39, src, dst);
506 DO_imm_mandr_r("dppd", 40, src, dst);
507 DO_imm_mandr_r("dppd", 41, src, dst);
508 DO_imm_mandr_r("dppd", 42, src, dst);
509 DO_imm_mandr_r("dppd", 43, src, dst);
510 DO_imm_mandr_r("dppd", 44, src, dst);
511 DO_imm_mandr_r("dppd", 45, src, dst);
512 DO_imm_mandr_r("dppd", 46, src, dst);
513 DO_imm_mandr_r("dppd", 47, src, dst);
514 DO_imm_mandr_r("dppd", 48, src, dst);
515 DO_imm_mandr_r("dppd", 49, src, dst);
516 DO_imm_mandr_r("dppd", 50, src, dst);
517 DO_imm_mandr_r("dppd", 51, src, dst);
518 DO_imm_mandr_r("dppd", 52, src, dst);
519 DO_imm_mandr_r("dppd", 53, src, dst);
520 DO_imm_mandr_r("dppd", 54, src, dst);
521 DO_imm_mandr_r("dppd", 55, src, dst);
522 DO_imm_mandr_r("dppd", 56, src, dst);
523 DO_imm_mandr_r("dppd", 57, src, dst);
524 DO_imm_mandr_r("dppd", 58, src, dst);
525 DO_imm_mandr_r("dppd", 59, src, dst);
526 DO_imm_mandr_r("dppd", 60, src, dst);
527 DO_imm_mandr_r("dppd", 61, src, dst);
528 DO_imm_mandr_r("dppd", 62, src, dst);
529 DO_imm_mandr_r("dppd", 63, src, dst);
530 DO_imm_mandr_r("dppd", 64, src, dst);
531 DO_imm_mandr_r("dppd", 65, src, dst);
532 DO_imm_mandr_r("dppd", 66, src, dst);
533 DO_imm_mandr_r("dppd", 67, src, dst);
534 DO_imm_mandr_r("dppd", 68, src, dst);
535 DO_imm_mandr_r("dppd", 69, src, dst);
536 DO_imm_mandr_r("dppd", 70, src, dst);
537 DO_imm_mandr_r("dppd", 71, src, dst);
538 DO_imm_mandr_r("dppd", 72, src, dst);
539 DO_imm_mandr_r("dppd", 73, src, dst);
540 DO_imm_mandr_r("dppd", 74, src, dst);
541 DO_imm_mandr_r("dppd", 75, src, dst);
542 DO_imm_mandr_r("dppd", 76, src, dst);
543 DO_imm_mandr_r("dppd", 77, src, dst);
544 DO_imm_mandr_r("dppd", 78, src, dst);
545 DO_imm_mandr_r("dppd", 79, src, dst);
546 DO_imm_mandr_r("dppd", 80, src, dst);
547 DO_imm_mandr_r("dppd", 81, src, dst);
548 DO_imm_mandr_r("dppd", 82, src, dst);
549 DO_imm_mandr_r("dppd", 83, src, dst);
550 DO_imm_mandr_r("dppd", 84, src, dst);
551 DO_imm_mandr_r("dppd", 85, src, dst);
552 DO_imm_mandr_r("dppd", 86, src, dst);
553 DO_imm_mandr_r("dppd", 87, src, dst);
554 DO_imm_mandr_r("dppd", 88, src, dst);
555 DO_imm_mandr_r("dppd", 89, src, dst);
556 DO_imm_mandr_r("dppd", 90, src, dst);
557 DO_imm_mandr_r("dppd", 91, src, dst);
558 DO_imm_mandr_r("dppd", 92, src, dst);
559 DO_imm_mandr_r("dppd", 93, src, dst);
560 DO_imm_mandr_r("dppd", 94, src, dst);
561 DO_imm_mandr_r("dppd", 95, src, dst);
562 DO_imm_mandr_r("dppd", 96, src, dst);
563 DO_imm_mandr_r("dppd", 97, src, dst);
564 DO_imm_mandr_r("dppd", 98, src, dst);
565 DO_imm_mandr_r("dppd", 99, src, dst);
566 DO_imm_mandr_r("dppd", 100, src, dst);
567 DO_imm_mandr_r("dppd", 101, src, dst);
568 DO_imm_mandr_r("dppd", 102, src, dst);
569 DO_imm_mandr_r("dppd", 103, src, dst);
570 DO_imm_mandr_r("dppd", 104, src, dst);
571 DO_imm_mandr_r("dppd", 105, src, dst);
572 DO_imm_mandr_r("dppd", 106, src, dst);
573 DO_imm_mandr_r("dppd", 107, src, dst);
574 DO_imm_mandr_r("dppd", 108, src, dst);
575 DO_imm_mandr_r("dppd", 109, src, dst);
576 DO_imm_mandr_r("dppd", 110, src, dst);
577 DO_imm_mandr_r("dppd", 111, src, dst);
578 DO_imm_mandr_r("dppd", 112, src, dst);
579 DO_imm_mandr_r("dppd", 113, src, dst);
580 DO_imm_mandr_r("dppd", 114, src, dst);
581 DO_imm_mandr_r("dppd", 115, src, dst);
582 DO_imm_mandr_r("dppd", 116, src, dst);
583 DO_imm_mandr_r("dppd", 117, src, dst);
584 DO_imm_mandr_r("dppd", 118, src, dst);
585 DO_imm_mandr_r("dppd", 119, src, dst);
586 DO_imm_mandr_r("dppd", 120, src, dst);
587 DO_imm_mandr_r("dppd", 121, src, dst);
588 DO_imm_mandr_r("dppd", 122, src, dst);
589 DO_imm_mandr_r("dppd", 123, src, dst);
590 DO_imm_mandr_r("dppd", 124, src, dst);
591 DO_imm_mandr_r("dppd", 125, src, dst);
592 DO_imm_mandr_r("dppd", 126, src, dst);
593 DO_imm_mandr_r("dppd", 127, src, dst);
594 DO_imm_mandr_r("dppd", 128, src, dst);
595 DO_imm_mandr_r("dppd", 129, src, dst);
596 DO_imm_mandr_r("dppd", 130, src, dst);
597 DO_imm_mandr_r("dppd", 131, src, dst);
598 DO_imm_mandr_r("dppd", 132, src, dst);
599 DO_imm_mandr_r("dppd", 133, src, dst);
600 DO_imm_mandr_r("dppd", 134, src, dst);
601 DO_imm_mandr_r("dppd", 135, src, dst);
602 DO_imm_mandr_r("dppd", 136, src, dst);
603 DO_imm_mandr_r("dppd", 137, src, dst);
604 DO_imm_mandr_r("dppd", 138, src, dst);
605 DO_imm_mandr_r("dppd", 139, src, dst);
606 DO_imm_mandr_r("dppd", 140, src, dst);
607 DO_imm_mandr_r("dppd", 141, src, dst);
608 DO_imm_mandr_r("dppd", 142, src, dst);
609 DO_imm_mandr_r("dppd", 143, src, dst);
610 DO_imm_mandr_r("dppd", 144, src, dst);
611 DO_imm_mandr_r("dppd", 145, src, dst);
612 DO_imm_mandr_r("dppd", 146, src, dst);
613 DO_imm_mandr_r("dppd", 147, src, dst);
614 DO_imm_mandr_r("dppd", 148, src, dst);
615 DO_imm_mandr_r("dppd", 149, src, dst);
616 DO_imm_mandr_r("dppd", 150, src, dst);
617 DO_imm_mandr_r("dppd", 151, src, dst);
618 DO_imm_mandr_r("dppd", 152, src, dst);
619 DO_imm_mandr_r("dppd", 153, src, dst);
620 DO_imm_mandr_r("dppd", 154, src, dst);
621 DO_imm_mandr_r("dppd", 155, src, dst);
622 DO_imm_mandr_r("dppd", 156, src, dst);
623 DO_imm_mandr_r("dppd", 157, src, dst);
624 DO_imm_mandr_r("dppd", 158, src, dst);
625 DO_imm_mandr_r("dppd", 159, src, dst);
626 DO_imm_mandr_r("dppd", 160, src, dst);
627 DO_imm_mandr_r("dppd", 161, src, dst);
628 DO_imm_mandr_r("dppd", 162, src, dst);
629 DO_imm_mandr_r("dppd", 163, src, dst);
630 DO_imm_mandr_r("dppd", 164, src, dst);
631 DO_imm_mandr_r("dppd", 165, src, dst);
632 DO_imm_mandr_r("dppd", 166, src, dst);
633 DO_imm_mandr_r("dppd", 167, src, dst);
634 DO_imm_mandr_r("dppd", 168, src, dst);
635 DO_imm_mandr_r("dppd", 169, src, dst);
636 DO_imm_mandr_r("dppd", 170, src, dst);
637 DO_imm_mandr_r("dppd", 171, src, dst);
638 DO_imm_mandr_r("dppd", 172, src, dst);
639 DO_imm_mandr_r("dppd", 173, src, dst);
640 DO_imm_mandr_r("dppd", 174, src, dst);
641 DO_imm_mandr_r("dppd", 175, src, dst);
642 DO_imm_mandr_r("dppd", 176, src, dst);
643 DO_imm_mandr_r("dppd", 177, src, dst);
644 DO_imm_mandr_r("dppd", 178, src, dst);
645 DO_imm_mandr_r("dppd", 179, src, dst);
646 DO_imm_mandr_r("dppd", 180, src, dst);
647 DO_imm_mandr_r("dppd", 181, src, dst);
648 DO_imm_mandr_r("dppd", 182, src, dst);
649 DO_imm_mandr_r("dppd", 183, src, dst);
650 DO_imm_mandr_r("dppd", 184, src, dst);
651 DO_imm_mandr_r("dppd", 185, src, dst);
652 DO_imm_mandr_r("dppd", 186, src, dst);
653 DO_imm_mandr_r("dppd", 187, src, dst);
654 DO_imm_mandr_r("dppd", 188, src, dst);
655 DO_imm_mandr_r("dppd", 189, src, dst);
656 DO_imm_mandr_r("dppd", 190, src, dst);
657 DO_imm_mandr_r("dppd", 191, src, dst);
658 DO_imm_mandr_r("dppd", 192, src, dst);
659 DO_imm_mandr_r("dppd", 193, src, dst);
660 DO_imm_mandr_r("dppd", 194, src, dst);
661 DO_imm_mandr_r("dppd", 195, src, dst);
662 DO_imm_mandr_r("dppd", 196, src, dst);
663 DO_imm_mandr_r("dppd", 197, src, dst);
664 DO_imm_mandr_r("dppd", 198, src, dst);
665 DO_imm_mandr_r("dppd", 199, src, dst);
666 DO_imm_mandr_r("dppd", 200, src, dst);
667 DO_imm_mandr_r("dppd", 201, src, dst);
668 DO_imm_mandr_r("dppd", 202, src, dst);
669 DO_imm_mandr_r("dppd", 203, src, dst);
670 DO_imm_mandr_r("dppd", 204, src, dst);
671 DO_imm_mandr_r("dppd", 205, src, dst);
672 DO_imm_mandr_r("dppd", 206, src, dst);
673 DO_imm_mandr_r("dppd", 207, src, dst);
674 DO_imm_mandr_r("dppd", 208, src, dst);
675 DO_imm_mandr_r("dppd", 209, src, dst);
676 DO_imm_mandr_r("dppd", 210, src, dst);
677 DO_imm_mandr_r("dppd", 211, src, dst);
678 DO_imm_mandr_r("dppd", 212, src, dst);
679 DO_imm_mandr_r("dppd", 213, src, dst);
680 DO_imm_mandr_r("dppd", 214, src, dst);
681 DO_imm_mandr_r("dppd", 215, src, dst);
682 DO_imm_mandr_r("dppd", 216, src, dst);
683 DO_imm_mandr_r("dppd", 217, src, dst);
684 DO_imm_mandr_r("dppd", 218, src, dst);
685 DO_imm_mandr_r("dppd", 219, src, dst);
686 DO_imm_mandr_r("dppd", 220, src, dst);
687 DO_imm_mandr_r("dppd", 221, src, dst);
688 DO_imm_mandr_r("dppd", 222, src, dst);
689 DO_imm_mandr_r("dppd", 223, src, dst);
690 DO_imm_mandr_r("dppd", 224, src, dst);
691 DO_imm_mandr_r("dppd", 225, src, dst);
692 DO_imm_mandr_r("dppd", 226, src, dst);
693 DO_imm_mandr_r("dppd", 227, src, dst);
694 DO_imm_mandr_r("dppd", 228, src, dst);
695 DO_imm_mandr_r("dppd", 229, src, dst);
696 DO_imm_mandr_r("dppd", 230, src, dst);
697 DO_imm_mandr_r("dppd", 231, src, dst);
698 DO_imm_mandr_r("dppd", 232, src, dst);
699 DO_imm_mandr_r("dppd", 233, src, dst);
700 DO_imm_mandr_r("dppd", 234, src, dst);
701 DO_imm_mandr_r("dppd", 235, src, dst);
702 DO_imm_mandr_r("dppd", 236, src, dst);
703 DO_imm_mandr_r("dppd", 237, src, dst);
704 DO_imm_mandr_r("dppd", 238, src, dst);
705 DO_imm_mandr_r("dppd", 239, src, dst);
706 DO_imm_mandr_r("dppd", 240, src, dst);
707 DO_imm_mandr_r("dppd", 241, src, dst);
708 DO_imm_mandr_r("dppd", 242, src, dst);
709 DO_imm_mandr_r("dppd", 243, src, dst);
710 DO_imm_mandr_r("dppd", 244, src, dst);
711 DO_imm_mandr_r("dppd", 245, src, dst);
712 DO_imm_mandr_r("dppd", 246, src, dst);
713 DO_imm_mandr_r("dppd", 247, src, dst);
714 DO_imm_mandr_r("dppd", 248, src, dst);
715 DO_imm_mandr_r("dppd", 249, src, dst);
716 DO_imm_mandr_r("dppd", 250, src, dst);
717 DO_imm_mandr_r("dppd", 251, src, dst);
718 DO_imm_mandr_r("dppd", 252, src, dst);
719 DO_imm_mandr_r("dppd", 253, src, dst);
720 DO_imm_mandr_r("dppd", 254, src, dst);
721 DO_imm_mandr_r("dppd", 255, src, dst);
722 }
723 }
724
test_DPPS(void)725 void test_DPPS ( void )
726 {
727 V128 src, dst;
728 {
729 *(float*)(&src[0]) = 1.2;
730 *(float*)(&src[4]) = -3.4;
731 *(float*)(&src[8]) = -6.7;
732 *(float*)(&src[12]) = 8.9;
733 *(float*)(&dst[0]) = -10.11;
734 *(float*)(&dst[4]) = 12.13;
735 *(float*)(&dst[8]) = 14.15;
736 *(float*)(&dst[12]) = -16.17;
737 DO_imm_mandr_r("dpps", 0, src, dst);
738 DO_imm_mandr_r("dpps", 1, src, dst);
739 DO_imm_mandr_r("dpps", 2, src, dst);
740 DO_imm_mandr_r("dpps", 3, src, dst);
741 DO_imm_mandr_r("dpps", 4, src, dst);
742 DO_imm_mandr_r("dpps", 5, src, dst);
743 DO_imm_mandr_r("dpps", 6, src, dst);
744 DO_imm_mandr_r("dpps", 7, src, dst);
745 DO_imm_mandr_r("dpps", 8, src, dst);
746 DO_imm_mandr_r("dpps", 9, src, dst);
747 DO_imm_mandr_r("dpps", 10, src, dst);
748 DO_imm_mandr_r("dpps", 11, src, dst);
749 DO_imm_mandr_r("dpps", 12, src, dst);
750 DO_imm_mandr_r("dpps", 13, src, dst);
751 DO_imm_mandr_r("dpps", 14, src, dst);
752 DO_imm_mandr_r("dpps", 15, src, dst);
753 DO_imm_mandr_r("dpps", 16, src, dst);
754 DO_imm_mandr_r("dpps", 17, src, dst);
755 DO_imm_mandr_r("dpps", 18, src, dst);
756 DO_imm_mandr_r("dpps", 19, src, dst);
757 DO_imm_mandr_r("dpps", 20, src, dst);
758 DO_imm_mandr_r("dpps", 21, src, dst);
759 DO_imm_mandr_r("dpps", 22, src, dst);
760 DO_imm_mandr_r("dpps", 23, src, dst);
761 DO_imm_mandr_r("dpps", 24, src, dst);
762 DO_imm_mandr_r("dpps", 25, src, dst);
763 DO_imm_mandr_r("dpps", 26, src, dst);
764 DO_imm_mandr_r("dpps", 27, src, dst);
765 DO_imm_mandr_r("dpps", 28, src, dst);
766 DO_imm_mandr_r("dpps", 29, src, dst);
767 DO_imm_mandr_r("dpps", 30, src, dst);
768 DO_imm_mandr_r("dpps", 31, src, dst);
769 DO_imm_mandr_r("dpps", 32, src, dst);
770 DO_imm_mandr_r("dpps", 33, src, dst);
771 DO_imm_mandr_r("dpps", 34, src, dst);
772 DO_imm_mandr_r("dpps", 35, src, dst);
773 DO_imm_mandr_r("dpps", 36, src, dst);
774 DO_imm_mandr_r("dpps", 37, src, dst);
775 DO_imm_mandr_r("dpps", 38, src, dst);
776 DO_imm_mandr_r("dpps", 39, src, dst);
777 DO_imm_mandr_r("dpps", 40, src, dst);
778 DO_imm_mandr_r("dpps", 41, src, dst);
779 DO_imm_mandr_r("dpps", 42, src, dst);
780 DO_imm_mandr_r("dpps", 43, src, dst);
781 DO_imm_mandr_r("dpps", 44, src, dst);
782 DO_imm_mandr_r("dpps", 45, src, dst);
783 DO_imm_mandr_r("dpps", 46, src, dst);
784 DO_imm_mandr_r("dpps", 47, src, dst);
785 DO_imm_mandr_r("dpps", 48, src, dst);
786 DO_imm_mandr_r("dpps", 49, src, dst);
787 DO_imm_mandr_r("dpps", 50, src, dst);
788 DO_imm_mandr_r("dpps", 51, src, dst);
789 DO_imm_mandr_r("dpps", 52, src, dst);
790 DO_imm_mandr_r("dpps", 53, src, dst);
791 DO_imm_mandr_r("dpps", 54, src, dst);
792 DO_imm_mandr_r("dpps", 55, src, dst);
793 DO_imm_mandr_r("dpps", 56, src, dst);
794 DO_imm_mandr_r("dpps", 57, src, dst);
795 DO_imm_mandr_r("dpps", 58, src, dst);
796 DO_imm_mandr_r("dpps", 59, src, dst);
797 DO_imm_mandr_r("dpps", 60, src, dst);
798 DO_imm_mandr_r("dpps", 61, src, dst);
799 DO_imm_mandr_r("dpps", 62, src, dst);
800 DO_imm_mandr_r("dpps", 63, src, dst);
801 DO_imm_mandr_r("dpps", 64, src, dst);
802 DO_imm_mandr_r("dpps", 65, src, dst);
803 DO_imm_mandr_r("dpps", 66, src, dst);
804 DO_imm_mandr_r("dpps", 67, src, dst);
805 DO_imm_mandr_r("dpps", 68, src, dst);
806 DO_imm_mandr_r("dpps", 69, src, dst);
807 DO_imm_mandr_r("dpps", 70, src, dst);
808 DO_imm_mandr_r("dpps", 71, src, dst);
809 DO_imm_mandr_r("dpps", 72, src, dst);
810 DO_imm_mandr_r("dpps", 73, src, dst);
811 DO_imm_mandr_r("dpps", 74, src, dst);
812 DO_imm_mandr_r("dpps", 75, src, dst);
813 DO_imm_mandr_r("dpps", 76, src, dst);
814 DO_imm_mandr_r("dpps", 77, src, dst);
815 DO_imm_mandr_r("dpps", 78, src, dst);
816 DO_imm_mandr_r("dpps", 79, src, dst);
817 DO_imm_mandr_r("dpps", 80, src, dst);
818 DO_imm_mandr_r("dpps", 81, src, dst);
819 DO_imm_mandr_r("dpps", 82, src, dst);
820 DO_imm_mandr_r("dpps", 83, src, dst);
821 DO_imm_mandr_r("dpps", 84, src, dst);
822 DO_imm_mandr_r("dpps", 85, src, dst);
823 DO_imm_mandr_r("dpps", 86, src, dst);
824 DO_imm_mandr_r("dpps", 87, src, dst);
825 DO_imm_mandr_r("dpps", 88, src, dst);
826 DO_imm_mandr_r("dpps", 89, src, dst);
827 DO_imm_mandr_r("dpps", 90, src, dst);
828 DO_imm_mandr_r("dpps", 91, src, dst);
829 DO_imm_mandr_r("dpps", 92, src, dst);
830 DO_imm_mandr_r("dpps", 93, src, dst);
831 DO_imm_mandr_r("dpps", 94, src, dst);
832 DO_imm_mandr_r("dpps", 95, src, dst);
833 DO_imm_mandr_r("dpps", 96, src, dst);
834 DO_imm_mandr_r("dpps", 97, src, dst);
835 DO_imm_mandr_r("dpps", 98, src, dst);
836 DO_imm_mandr_r("dpps", 99, src, dst);
837 DO_imm_mandr_r("dpps", 100, src, dst);
838 DO_imm_mandr_r("dpps", 101, src, dst);
839 DO_imm_mandr_r("dpps", 102, src, dst);
840 DO_imm_mandr_r("dpps", 103, src, dst);
841 DO_imm_mandr_r("dpps", 104, src, dst);
842 DO_imm_mandr_r("dpps", 105, src, dst);
843 DO_imm_mandr_r("dpps", 106, src, dst);
844 DO_imm_mandr_r("dpps", 107, src, dst);
845 DO_imm_mandr_r("dpps", 108, src, dst);
846 DO_imm_mandr_r("dpps", 109, src, dst);
847 DO_imm_mandr_r("dpps", 110, src, dst);
848 DO_imm_mandr_r("dpps", 111, src, dst);
849 DO_imm_mandr_r("dpps", 112, src, dst);
850 DO_imm_mandr_r("dpps", 113, src, dst);
851 DO_imm_mandr_r("dpps", 114, src, dst);
852 DO_imm_mandr_r("dpps", 115, src, dst);
853 DO_imm_mandr_r("dpps", 116, src, dst);
854 DO_imm_mandr_r("dpps", 117, src, dst);
855 DO_imm_mandr_r("dpps", 118, src, dst);
856 DO_imm_mandr_r("dpps", 119, src, dst);
857 DO_imm_mandr_r("dpps", 120, src, dst);
858 DO_imm_mandr_r("dpps", 121, src, dst);
859 DO_imm_mandr_r("dpps", 122, src, dst);
860 DO_imm_mandr_r("dpps", 123, src, dst);
861 DO_imm_mandr_r("dpps", 124, src, dst);
862 DO_imm_mandr_r("dpps", 125, src, dst);
863 DO_imm_mandr_r("dpps", 126, src, dst);
864 DO_imm_mandr_r("dpps", 127, src, dst);
865 DO_imm_mandr_r("dpps", 128, src, dst);
866 DO_imm_mandr_r("dpps", 129, src, dst);
867 DO_imm_mandr_r("dpps", 130, src, dst);
868 DO_imm_mandr_r("dpps", 131, src, dst);
869 DO_imm_mandr_r("dpps", 132, src, dst);
870 DO_imm_mandr_r("dpps", 133, src, dst);
871 DO_imm_mandr_r("dpps", 134, src, dst);
872 DO_imm_mandr_r("dpps", 135, src, dst);
873 DO_imm_mandr_r("dpps", 136, src, dst);
874 DO_imm_mandr_r("dpps", 137, src, dst);
875 DO_imm_mandr_r("dpps", 138, src, dst);
876 DO_imm_mandr_r("dpps", 139, src, dst);
877 DO_imm_mandr_r("dpps", 140, src, dst);
878 DO_imm_mandr_r("dpps", 141, src, dst);
879 DO_imm_mandr_r("dpps", 142, src, dst);
880 DO_imm_mandr_r("dpps", 143, src, dst);
881 DO_imm_mandr_r("dpps", 144, src, dst);
882 DO_imm_mandr_r("dpps", 145, src, dst);
883 DO_imm_mandr_r("dpps", 146, src, dst);
884 DO_imm_mandr_r("dpps", 147, src, dst);
885 DO_imm_mandr_r("dpps", 148, src, dst);
886 DO_imm_mandr_r("dpps", 149, src, dst);
887 DO_imm_mandr_r("dpps", 150, src, dst);
888 DO_imm_mandr_r("dpps", 151, src, dst);
889 DO_imm_mandr_r("dpps", 152, src, dst);
890 DO_imm_mandr_r("dpps", 153, src, dst);
891 DO_imm_mandr_r("dpps", 154, src, dst);
892 DO_imm_mandr_r("dpps", 155, src, dst);
893 DO_imm_mandr_r("dpps", 156, src, dst);
894 DO_imm_mandr_r("dpps", 157, src, dst);
895 DO_imm_mandr_r("dpps", 158, src, dst);
896 DO_imm_mandr_r("dpps", 159, src, dst);
897 DO_imm_mandr_r("dpps", 160, src, dst);
898 DO_imm_mandr_r("dpps", 161, src, dst);
899 DO_imm_mandr_r("dpps", 162, src, dst);
900 DO_imm_mandr_r("dpps", 163, src, dst);
901 DO_imm_mandr_r("dpps", 164, src, dst);
902 DO_imm_mandr_r("dpps", 165, src, dst);
903 DO_imm_mandr_r("dpps", 166, src, dst);
904 DO_imm_mandr_r("dpps", 167, src, dst);
905 DO_imm_mandr_r("dpps", 168, src, dst);
906 DO_imm_mandr_r("dpps", 169, src, dst);
907 DO_imm_mandr_r("dpps", 170, src, dst);
908 DO_imm_mandr_r("dpps", 171, src, dst);
909 DO_imm_mandr_r("dpps", 172, src, dst);
910 DO_imm_mandr_r("dpps", 173, src, dst);
911 DO_imm_mandr_r("dpps", 174, src, dst);
912 DO_imm_mandr_r("dpps", 175, src, dst);
913 DO_imm_mandr_r("dpps", 176, src, dst);
914 DO_imm_mandr_r("dpps", 177, src, dst);
915 DO_imm_mandr_r("dpps", 178, src, dst);
916 DO_imm_mandr_r("dpps", 179, src, dst);
917 DO_imm_mandr_r("dpps", 180, src, dst);
918 DO_imm_mandr_r("dpps", 181, src, dst);
919 DO_imm_mandr_r("dpps", 182, src, dst);
920 DO_imm_mandr_r("dpps", 183, src, dst);
921 DO_imm_mandr_r("dpps", 184, src, dst);
922 DO_imm_mandr_r("dpps", 185, src, dst);
923 DO_imm_mandr_r("dpps", 186, src, dst);
924 DO_imm_mandr_r("dpps", 187, src, dst);
925 DO_imm_mandr_r("dpps", 188, src, dst);
926 DO_imm_mandr_r("dpps", 189, src, dst);
927 DO_imm_mandr_r("dpps", 190, src, dst);
928 DO_imm_mandr_r("dpps", 191, src, dst);
929 DO_imm_mandr_r("dpps", 192, src, dst);
930 DO_imm_mandr_r("dpps", 193, src, dst);
931 DO_imm_mandr_r("dpps", 194, src, dst);
932 DO_imm_mandr_r("dpps", 195, src, dst);
933 DO_imm_mandr_r("dpps", 196, src, dst);
934 DO_imm_mandr_r("dpps", 197, src, dst);
935 DO_imm_mandr_r("dpps", 198, src, dst);
936 DO_imm_mandr_r("dpps", 199, src, dst);
937 DO_imm_mandr_r("dpps", 200, src, dst);
938 DO_imm_mandr_r("dpps", 201, src, dst);
939 DO_imm_mandr_r("dpps", 202, src, dst);
940 DO_imm_mandr_r("dpps", 203, src, dst);
941 DO_imm_mandr_r("dpps", 204, src, dst);
942 DO_imm_mandr_r("dpps", 205, src, dst);
943 DO_imm_mandr_r("dpps", 206, src, dst);
944 DO_imm_mandr_r("dpps", 207, src, dst);
945 DO_imm_mandr_r("dpps", 208, src, dst);
946 DO_imm_mandr_r("dpps", 209, src, dst);
947 DO_imm_mandr_r("dpps", 210, src, dst);
948 DO_imm_mandr_r("dpps", 211, src, dst);
949 DO_imm_mandr_r("dpps", 212, src, dst);
950 DO_imm_mandr_r("dpps", 213, src, dst);
951 DO_imm_mandr_r("dpps", 214, src, dst);
952 DO_imm_mandr_r("dpps", 215, src, dst);
953 DO_imm_mandr_r("dpps", 216, src, dst);
954 DO_imm_mandr_r("dpps", 217, src, dst);
955 DO_imm_mandr_r("dpps", 218, src, dst);
956 DO_imm_mandr_r("dpps", 219, src, dst);
957 DO_imm_mandr_r("dpps", 220, src, dst);
958 DO_imm_mandr_r("dpps", 221, src, dst);
959 DO_imm_mandr_r("dpps", 222, src, dst);
960 DO_imm_mandr_r("dpps", 223, src, dst);
961 DO_imm_mandr_r("dpps", 224, src, dst);
962 DO_imm_mandr_r("dpps", 225, src, dst);
963 DO_imm_mandr_r("dpps", 226, src, dst);
964 DO_imm_mandr_r("dpps", 227, src, dst);
965 DO_imm_mandr_r("dpps", 228, src, dst);
966 DO_imm_mandr_r("dpps", 229, src, dst);
967 DO_imm_mandr_r("dpps", 230, src, dst);
968 DO_imm_mandr_r("dpps", 231, src, dst);
969 DO_imm_mandr_r("dpps", 232, src, dst);
970 DO_imm_mandr_r("dpps", 233, src, dst);
971 DO_imm_mandr_r("dpps", 234, src, dst);
972 DO_imm_mandr_r("dpps", 235, src, dst);
973 DO_imm_mandr_r("dpps", 236, src, dst);
974 DO_imm_mandr_r("dpps", 237, src, dst);
975 DO_imm_mandr_r("dpps", 238, src, dst);
976 DO_imm_mandr_r("dpps", 239, src, dst);
977 DO_imm_mandr_r("dpps", 240, src, dst);
978 DO_imm_mandr_r("dpps", 241, src, dst);
979 DO_imm_mandr_r("dpps", 242, src, dst);
980 DO_imm_mandr_r("dpps", 243, src, dst);
981 DO_imm_mandr_r("dpps", 244, src, dst);
982 DO_imm_mandr_r("dpps", 245, src, dst);
983 DO_imm_mandr_r("dpps", 246, src, dst);
984 DO_imm_mandr_r("dpps", 247, src, dst);
985 DO_imm_mandr_r("dpps", 248, src, dst);
986 DO_imm_mandr_r("dpps", 249, src, dst);
987 DO_imm_mandr_r("dpps", 250, src, dst);
988 DO_imm_mandr_r("dpps", 251, src, dst);
989 DO_imm_mandr_r("dpps", 252, src, dst);
990 DO_imm_mandr_r("dpps", 253, src, dst);
991 DO_imm_mandr_r("dpps", 254, src, dst);
992 DO_imm_mandr_r("dpps", 255, src, dst);
993 }
994 }
995
test_INSERTPS(void)996 void test_INSERTPS ( void )
997 {
998 V128 src, dst;
999 {
1000 *(float*)(&src[0]) = 1.2;
1001 *(float*)(&src[4]) = -3.4;
1002 *(float*)(&src[8]) = -6.7;
1003 *(float*)(&src[12]) = 8.9;
1004 *(float*)(&dst[0]) = -10.11;
1005 *(float*)(&dst[4]) = 12.13;
1006 *(float*)(&dst[8]) = 14.15;
1007 *(float*)(&dst[12]) = -16.17;
1008 DO_imm_mandr_r("insertps", 0, src, dst);
1009 DO_imm_mandr_r("insertps", 1, src, dst);
1010 DO_imm_mandr_r("insertps", 2, src, dst);
1011 DO_imm_mandr_r("insertps", 3, src, dst);
1012 DO_imm_mandr_r("insertps", 4, src, dst);
1013 DO_imm_mandr_r("insertps", 5, src, dst);
1014 DO_imm_mandr_r("insertps", 6, src, dst);
1015 DO_imm_mandr_r("insertps", 7, src, dst);
1016 DO_imm_mandr_r("insertps", 8, src, dst);
1017 DO_imm_mandr_r("insertps", 9, src, dst);
1018 DO_imm_mandr_r("insertps", 10, src, dst);
1019 DO_imm_mandr_r("insertps", 11, src, dst);
1020 DO_imm_mandr_r("insertps", 12, src, dst);
1021 DO_imm_mandr_r("insertps", 13, src, dst);
1022 DO_imm_mandr_r("insertps", 14, src, dst);
1023 DO_imm_mandr_r("insertps", 15, src, dst);
1024 DO_imm_mandr_r("insertps", 16, src, dst);
1025 DO_imm_mandr_r("insertps", 17, src, dst);
1026 DO_imm_mandr_r("insertps", 18, src, dst);
1027 DO_imm_mandr_r("insertps", 19, src, dst);
1028 DO_imm_mandr_r("insertps", 20, src, dst);
1029 DO_imm_mandr_r("insertps", 21, src, dst);
1030 DO_imm_mandr_r("insertps", 22, src, dst);
1031 DO_imm_mandr_r("insertps", 23, src, dst);
1032 DO_imm_mandr_r("insertps", 24, src, dst);
1033 DO_imm_mandr_r("insertps", 25, src, dst);
1034 DO_imm_mandr_r("insertps", 26, src, dst);
1035 DO_imm_mandr_r("insertps", 27, src, dst);
1036 DO_imm_mandr_r("insertps", 28, src, dst);
1037 DO_imm_mandr_r("insertps", 29, src, dst);
1038 DO_imm_mandr_r("insertps", 30, src, dst);
1039 DO_imm_mandr_r("insertps", 31, src, dst);
1040 DO_imm_mandr_r("insertps", 32, src, dst);
1041 DO_imm_mandr_r("insertps", 33, src, dst);
1042 DO_imm_mandr_r("insertps", 34, src, dst);
1043 DO_imm_mandr_r("insertps", 35, src, dst);
1044 DO_imm_mandr_r("insertps", 36, src, dst);
1045 DO_imm_mandr_r("insertps", 37, src, dst);
1046 DO_imm_mandr_r("insertps", 38, src, dst);
1047 DO_imm_mandr_r("insertps", 39, src, dst);
1048 DO_imm_mandr_r("insertps", 40, src, dst);
1049 DO_imm_mandr_r("insertps", 41, src, dst);
1050 DO_imm_mandr_r("insertps", 42, src, dst);
1051 DO_imm_mandr_r("insertps", 43, src, dst);
1052 DO_imm_mandr_r("insertps", 44, src, dst);
1053 DO_imm_mandr_r("insertps", 45, src, dst);
1054 DO_imm_mandr_r("insertps", 46, src, dst);
1055 DO_imm_mandr_r("insertps", 47, src, dst);
1056 DO_imm_mandr_r("insertps", 48, src, dst);
1057 DO_imm_mandr_r("insertps", 49, src, dst);
1058 DO_imm_mandr_r("insertps", 50, src, dst);
1059 DO_imm_mandr_r("insertps", 51, src, dst);
1060 DO_imm_mandr_r("insertps", 52, src, dst);
1061 DO_imm_mandr_r("insertps", 53, src, dst);
1062 DO_imm_mandr_r("insertps", 54, src, dst);
1063 DO_imm_mandr_r("insertps", 55, src, dst);
1064 DO_imm_mandr_r("insertps", 56, src, dst);
1065 DO_imm_mandr_r("insertps", 57, src, dst);
1066 DO_imm_mandr_r("insertps", 58, src, dst);
1067 DO_imm_mandr_r("insertps", 59, src, dst);
1068 DO_imm_mandr_r("insertps", 60, src, dst);
1069 DO_imm_mandr_r("insertps", 61, src, dst);
1070 DO_imm_mandr_r("insertps", 62, src, dst);
1071 DO_imm_mandr_r("insertps", 63, src, dst);
1072 DO_imm_mandr_r("insertps", 64, src, dst);
1073 DO_imm_mandr_r("insertps", 65, src, dst);
1074 DO_imm_mandr_r("insertps", 66, src, dst);
1075 DO_imm_mandr_r("insertps", 67, src, dst);
1076 DO_imm_mandr_r("insertps", 68, src, dst);
1077 DO_imm_mandr_r("insertps", 69, src, dst);
1078 DO_imm_mandr_r("insertps", 70, src, dst);
1079 DO_imm_mandr_r("insertps", 71, src, dst);
1080 DO_imm_mandr_r("insertps", 72, src, dst);
1081 DO_imm_mandr_r("insertps", 73, src, dst);
1082 DO_imm_mandr_r("insertps", 74, src, dst);
1083 DO_imm_mandr_r("insertps", 75, src, dst);
1084 DO_imm_mandr_r("insertps", 76, src, dst);
1085 DO_imm_mandr_r("insertps", 77, src, dst);
1086 DO_imm_mandr_r("insertps", 78, src, dst);
1087 DO_imm_mandr_r("insertps", 79, src, dst);
1088 DO_imm_mandr_r("insertps", 80, src, dst);
1089 DO_imm_mandr_r("insertps", 81, src, dst);
1090 DO_imm_mandr_r("insertps", 82, src, dst);
1091 DO_imm_mandr_r("insertps", 83, src, dst);
1092 DO_imm_mandr_r("insertps", 84, src, dst);
1093 DO_imm_mandr_r("insertps", 85, src, dst);
1094 DO_imm_mandr_r("insertps", 86, src, dst);
1095 DO_imm_mandr_r("insertps", 87, src, dst);
1096 DO_imm_mandr_r("insertps", 88, src, dst);
1097 DO_imm_mandr_r("insertps", 89, src, dst);
1098 DO_imm_mandr_r("insertps", 90, src, dst);
1099 DO_imm_mandr_r("insertps", 91, src, dst);
1100 DO_imm_mandr_r("insertps", 92, src, dst);
1101 DO_imm_mandr_r("insertps", 93, src, dst);
1102 DO_imm_mandr_r("insertps", 94, src, dst);
1103 DO_imm_mandr_r("insertps", 95, src, dst);
1104 DO_imm_mandr_r("insertps", 96, src, dst);
1105 DO_imm_mandr_r("insertps", 97, src, dst);
1106 DO_imm_mandr_r("insertps", 98, src, dst);
1107 DO_imm_mandr_r("insertps", 99, src, dst);
1108 DO_imm_mandr_r("insertps", 100, src, dst);
1109 DO_imm_mandr_r("insertps", 101, src, dst);
1110 DO_imm_mandr_r("insertps", 102, src, dst);
1111 DO_imm_mandr_r("insertps", 103, src, dst);
1112 DO_imm_mandr_r("insertps", 104, src, dst);
1113 DO_imm_mandr_r("insertps", 105, src, dst);
1114 DO_imm_mandr_r("insertps", 106, src, dst);
1115 DO_imm_mandr_r("insertps", 107, src, dst);
1116 DO_imm_mandr_r("insertps", 108, src, dst);
1117 DO_imm_mandr_r("insertps", 109, src, dst);
1118 DO_imm_mandr_r("insertps", 110, src, dst);
1119 DO_imm_mandr_r("insertps", 111, src, dst);
1120 DO_imm_mandr_r("insertps", 112, src, dst);
1121 DO_imm_mandr_r("insertps", 113, src, dst);
1122 DO_imm_mandr_r("insertps", 114, src, dst);
1123 DO_imm_mandr_r("insertps", 115, src, dst);
1124 DO_imm_mandr_r("insertps", 116, src, dst);
1125 DO_imm_mandr_r("insertps", 117, src, dst);
1126 DO_imm_mandr_r("insertps", 118, src, dst);
1127 DO_imm_mandr_r("insertps", 119, src, dst);
1128 DO_imm_mandr_r("insertps", 120, src, dst);
1129 DO_imm_mandr_r("insertps", 121, src, dst);
1130 DO_imm_mandr_r("insertps", 122, src, dst);
1131 DO_imm_mandr_r("insertps", 123, src, dst);
1132 DO_imm_mandr_r("insertps", 124, src, dst);
1133 DO_imm_mandr_r("insertps", 125, src, dst);
1134 DO_imm_mandr_r("insertps", 126, src, dst);
1135 DO_imm_mandr_r("insertps", 127, src, dst);
1136 DO_imm_mandr_r("insertps", 128, src, dst);
1137 DO_imm_mandr_r("insertps", 129, src, dst);
1138 DO_imm_mandr_r("insertps", 130, src, dst);
1139 DO_imm_mandr_r("insertps", 131, src, dst);
1140 DO_imm_mandr_r("insertps", 132, src, dst);
1141 DO_imm_mandr_r("insertps", 133, src, dst);
1142 DO_imm_mandr_r("insertps", 134, src, dst);
1143 DO_imm_mandr_r("insertps", 135, src, dst);
1144 DO_imm_mandr_r("insertps", 136, src, dst);
1145 DO_imm_mandr_r("insertps", 137, src, dst);
1146 DO_imm_mandr_r("insertps", 138, src, dst);
1147 DO_imm_mandr_r("insertps", 139, src, dst);
1148 DO_imm_mandr_r("insertps", 140, src, dst);
1149 DO_imm_mandr_r("insertps", 141, src, dst);
1150 DO_imm_mandr_r("insertps", 142, src, dst);
1151 DO_imm_mandr_r("insertps", 143, src, dst);
1152 DO_imm_mandr_r("insertps", 144, src, dst);
1153 DO_imm_mandr_r("insertps", 145, src, dst);
1154 DO_imm_mandr_r("insertps", 146, src, dst);
1155 DO_imm_mandr_r("insertps", 147, src, dst);
1156 DO_imm_mandr_r("insertps", 148, src, dst);
1157 DO_imm_mandr_r("insertps", 149, src, dst);
1158 DO_imm_mandr_r("insertps", 150, src, dst);
1159 DO_imm_mandr_r("insertps", 151, src, dst);
1160 DO_imm_mandr_r("insertps", 152, src, dst);
1161 DO_imm_mandr_r("insertps", 153, src, dst);
1162 DO_imm_mandr_r("insertps", 154, src, dst);
1163 DO_imm_mandr_r("insertps", 155, src, dst);
1164 DO_imm_mandr_r("insertps", 156, src, dst);
1165 DO_imm_mandr_r("insertps", 157, src, dst);
1166 DO_imm_mandr_r("insertps", 158, src, dst);
1167 DO_imm_mandr_r("insertps", 159, src, dst);
1168 DO_imm_mandr_r("insertps", 160, src, dst);
1169 DO_imm_mandr_r("insertps", 161, src, dst);
1170 DO_imm_mandr_r("insertps", 162, src, dst);
1171 DO_imm_mandr_r("insertps", 163, src, dst);
1172 DO_imm_mandr_r("insertps", 164, src, dst);
1173 DO_imm_mandr_r("insertps", 165, src, dst);
1174 DO_imm_mandr_r("insertps", 166, src, dst);
1175 DO_imm_mandr_r("insertps", 167, src, dst);
1176 DO_imm_mandr_r("insertps", 168, src, dst);
1177 DO_imm_mandr_r("insertps", 169, src, dst);
1178 DO_imm_mandr_r("insertps", 170, src, dst);
1179 DO_imm_mandr_r("insertps", 171, src, dst);
1180 DO_imm_mandr_r("insertps", 172, src, dst);
1181 DO_imm_mandr_r("insertps", 173, src, dst);
1182 DO_imm_mandr_r("insertps", 174, src, dst);
1183 DO_imm_mandr_r("insertps", 175, src, dst);
1184 DO_imm_mandr_r("insertps", 176, src, dst);
1185 DO_imm_mandr_r("insertps", 177, src, dst);
1186 DO_imm_mandr_r("insertps", 178, src, dst);
1187 DO_imm_mandr_r("insertps", 179, src, dst);
1188 DO_imm_mandr_r("insertps", 180, src, dst);
1189 DO_imm_mandr_r("insertps", 181, src, dst);
1190 DO_imm_mandr_r("insertps", 182, src, dst);
1191 DO_imm_mandr_r("insertps", 183, src, dst);
1192 DO_imm_mandr_r("insertps", 184, src, dst);
1193 DO_imm_mandr_r("insertps", 185, src, dst);
1194 DO_imm_mandr_r("insertps", 186, src, dst);
1195 DO_imm_mandr_r("insertps", 187, src, dst);
1196 DO_imm_mandr_r("insertps", 188, src, dst);
1197 DO_imm_mandr_r("insertps", 189, src, dst);
1198 DO_imm_mandr_r("insertps", 190, src, dst);
1199 DO_imm_mandr_r("insertps", 191, src, dst);
1200 DO_imm_mandr_r("insertps", 192, src, dst);
1201 DO_imm_mandr_r("insertps", 193, src, dst);
1202 DO_imm_mandr_r("insertps", 194, src, dst);
1203 DO_imm_mandr_r("insertps", 195, src, dst);
1204 DO_imm_mandr_r("insertps", 196, src, dst);
1205 DO_imm_mandr_r("insertps", 197, src, dst);
1206 DO_imm_mandr_r("insertps", 198, src, dst);
1207 DO_imm_mandr_r("insertps", 199, src, dst);
1208 DO_imm_mandr_r("insertps", 200, src, dst);
1209 DO_imm_mandr_r("insertps", 201, src, dst);
1210 DO_imm_mandr_r("insertps", 202, src, dst);
1211 DO_imm_mandr_r("insertps", 203, src, dst);
1212 DO_imm_mandr_r("insertps", 204, src, dst);
1213 DO_imm_mandr_r("insertps", 205, src, dst);
1214 DO_imm_mandr_r("insertps", 206, src, dst);
1215 DO_imm_mandr_r("insertps", 207, src, dst);
1216 DO_imm_mandr_r("insertps", 208, src, dst);
1217 DO_imm_mandr_r("insertps", 209, src, dst);
1218 DO_imm_mandr_r("insertps", 210, src, dst);
1219 DO_imm_mandr_r("insertps", 211, src, dst);
1220 DO_imm_mandr_r("insertps", 212, src, dst);
1221 DO_imm_mandr_r("insertps", 213, src, dst);
1222 DO_imm_mandr_r("insertps", 214, src, dst);
1223 DO_imm_mandr_r("insertps", 215, src, dst);
1224 DO_imm_mandr_r("insertps", 216, src, dst);
1225 DO_imm_mandr_r("insertps", 217, src, dst);
1226 DO_imm_mandr_r("insertps", 218, src, dst);
1227 DO_imm_mandr_r("insertps", 219, src, dst);
1228 DO_imm_mandr_r("insertps", 220, src, dst);
1229 DO_imm_mandr_r("insertps", 221, src, dst);
1230 DO_imm_mandr_r("insertps", 222, src, dst);
1231 DO_imm_mandr_r("insertps", 223, src, dst);
1232 DO_imm_mandr_r("insertps", 224, src, dst);
1233 DO_imm_mandr_r("insertps", 225, src, dst);
1234 DO_imm_mandr_r("insertps", 226, src, dst);
1235 DO_imm_mandr_r("insertps", 227, src, dst);
1236 DO_imm_mandr_r("insertps", 228, src, dst);
1237 DO_imm_mandr_r("insertps", 229, src, dst);
1238 DO_imm_mandr_r("insertps", 230, src, dst);
1239 DO_imm_mandr_r("insertps", 231, src, dst);
1240 DO_imm_mandr_r("insertps", 232, src, dst);
1241 DO_imm_mandr_r("insertps", 233, src, dst);
1242 DO_imm_mandr_r("insertps", 234, src, dst);
1243 DO_imm_mandr_r("insertps", 235, src, dst);
1244 DO_imm_mandr_r("insertps", 236, src, dst);
1245 DO_imm_mandr_r("insertps", 237, src, dst);
1246 DO_imm_mandr_r("insertps", 238, src, dst);
1247 DO_imm_mandr_r("insertps", 239, src, dst);
1248 DO_imm_mandr_r("insertps", 240, src, dst);
1249 DO_imm_mandr_r("insertps", 241, src, dst);
1250 DO_imm_mandr_r("insertps", 242, src, dst);
1251 DO_imm_mandr_r("insertps", 243, src, dst);
1252 DO_imm_mandr_r("insertps", 244, src, dst);
1253 DO_imm_mandr_r("insertps", 245, src, dst);
1254 DO_imm_mandr_r("insertps", 246, src, dst);
1255 DO_imm_mandr_r("insertps", 247, src, dst);
1256 DO_imm_mandr_r("insertps", 248, src, dst);
1257 DO_imm_mandr_r("insertps", 249, src, dst);
1258 DO_imm_mandr_r("insertps", 250, src, dst);
1259 DO_imm_mandr_r("insertps", 251, src, dst);
1260 DO_imm_mandr_r("insertps", 252, src, dst);
1261 DO_imm_mandr_r("insertps", 253, src, dst);
1262 DO_imm_mandr_r("insertps", 254, src, dst);
1263 DO_imm_mandr_r("insertps", 255, src, dst);
1264 }
1265 }
1266
test_MPSADBW(void)1267 void test_MPSADBW ( void )
1268 {
1269 V128 src, dst;
1270 Int i;
1271 for (i = 0; i < 50; i++) {
1272 randV128(&src);
1273 randV128(&dst);
1274 DO_imm_mandr_r("mpsadbw", 0, src, dst);
1275 DO_imm_mandr_r("mpsadbw", 1, src, dst);
1276 DO_imm_mandr_r("mpsadbw", 2, src, dst);
1277 DO_imm_mandr_r("mpsadbw", 3, src, dst);
1278 DO_imm_mandr_r("mpsadbw", 4, src, dst);
1279 DO_imm_mandr_r("mpsadbw", 5, src, dst);
1280 DO_imm_mandr_r("mpsadbw", 6, src, dst);
1281 DO_imm_mandr_r("mpsadbw", 7, src, dst);
1282 }
1283 }
1284
test_PACKUSDW(void)1285 void test_PACKUSDW ( void )
1286 {
1287 V128 src, dst;
1288 Int i;
1289 for (i = 0; i < 10; i++) {
1290 if (i < 9) {
1291 randV128(&src);
1292 randV128(&dst);
1293 } else {
1294 memset(&src, 0, sizeof(src));
1295 memset(&dst, 0, sizeof(src));
1296 src[0] = 0x11; src[1] = 0x22;
1297 src[4] = 0x33; src[5] = 0x44;
1298 src[8] = 0x55; src[9] = 0x66;
1299 src[12] = 0x77; src[13] = 0x88;
1300 dst[0] = 0xaa; dst[1] = 0xbb;
1301 dst[4] = 0xcc; dst[5] = 0xdd;
1302 dst[8] = 0xee; dst[9] = 0xff;
1303 dst[12] = 0xa1; dst[13] = 0xb2;
1304 }
1305 DO_mandr_r("packusdw", src, dst);
1306 }
1307 }
1308
test_PBLENDW(void)1309 void test_PBLENDW ( void )
1310 {
1311 V128 src, dst;
1312 randV128(&src);
1313 randV128(&dst);
1314 {
1315 DO_imm_mandr_r("pblendw", 0, src, dst);
1316 DO_imm_mandr_r("pblendw", 1, src, dst);
1317 DO_imm_mandr_r("pblendw", 2, src, dst);
1318 DO_imm_mandr_r("pblendw", 3, src, dst);
1319 DO_imm_mandr_r("pblendw", 4, src, dst);
1320 DO_imm_mandr_r("pblendw", 5, src, dst);
1321 DO_imm_mandr_r("pblendw", 6, src, dst);
1322 DO_imm_mandr_r("pblendw", 7, src, dst);
1323 DO_imm_mandr_r("pblendw", 8, src, dst);
1324 DO_imm_mandr_r("pblendw", 9, src, dst);
1325 DO_imm_mandr_r("pblendw", 10, src, dst);
1326 DO_imm_mandr_r("pblendw", 11, src, dst);
1327 DO_imm_mandr_r("pblendw", 12, src, dst);
1328 DO_imm_mandr_r("pblendw", 13, src, dst);
1329 DO_imm_mandr_r("pblendw", 14, src, dst);
1330 DO_imm_mandr_r("pblendw", 15, src, dst);
1331 DO_imm_mandr_r("pblendw", 16, src, dst);
1332 DO_imm_mandr_r("pblendw", 17, src, dst);
1333 DO_imm_mandr_r("pblendw", 18, src, dst);
1334 DO_imm_mandr_r("pblendw", 19, src, dst);
1335 DO_imm_mandr_r("pblendw", 20, src, dst);
1336 DO_imm_mandr_r("pblendw", 21, src, dst);
1337 DO_imm_mandr_r("pblendw", 22, src, dst);
1338 DO_imm_mandr_r("pblendw", 23, src, dst);
1339 DO_imm_mandr_r("pblendw", 24, src, dst);
1340 DO_imm_mandr_r("pblendw", 25, src, dst);
1341 DO_imm_mandr_r("pblendw", 26, src, dst);
1342 DO_imm_mandr_r("pblendw", 27, src, dst);
1343 DO_imm_mandr_r("pblendw", 28, src, dst);
1344 DO_imm_mandr_r("pblendw", 29, src, dst);
1345 DO_imm_mandr_r("pblendw", 30, src, dst);
1346 DO_imm_mandr_r("pblendw", 31, src, dst);
1347 DO_imm_mandr_r("pblendw", 32, src, dst);
1348 DO_imm_mandr_r("pblendw", 33, src, dst);
1349 DO_imm_mandr_r("pblendw", 34, src, dst);
1350 DO_imm_mandr_r("pblendw", 35, src, dst);
1351 DO_imm_mandr_r("pblendw", 36, src, dst);
1352 DO_imm_mandr_r("pblendw", 37, src, dst);
1353 DO_imm_mandr_r("pblendw", 38, src, dst);
1354 DO_imm_mandr_r("pblendw", 39, src, dst);
1355 DO_imm_mandr_r("pblendw", 40, src, dst);
1356 DO_imm_mandr_r("pblendw", 41, src, dst);
1357 DO_imm_mandr_r("pblendw", 42, src, dst);
1358 DO_imm_mandr_r("pblendw", 43, src, dst);
1359 DO_imm_mandr_r("pblendw", 44, src, dst);
1360 DO_imm_mandr_r("pblendw", 45, src, dst);
1361 DO_imm_mandr_r("pblendw", 46, src, dst);
1362 DO_imm_mandr_r("pblendw", 47, src, dst);
1363 DO_imm_mandr_r("pblendw", 48, src, dst);
1364 DO_imm_mandr_r("pblendw", 49, src, dst);
1365 DO_imm_mandr_r("pblendw", 50, src, dst);
1366 DO_imm_mandr_r("pblendw", 51, src, dst);
1367 DO_imm_mandr_r("pblendw", 52, src, dst);
1368 DO_imm_mandr_r("pblendw", 53, src, dst);
1369 DO_imm_mandr_r("pblendw", 54, src, dst);
1370 DO_imm_mandr_r("pblendw", 55, src, dst);
1371 DO_imm_mandr_r("pblendw", 56, src, dst);
1372 DO_imm_mandr_r("pblendw", 57, src, dst);
1373 DO_imm_mandr_r("pblendw", 58, src, dst);
1374 DO_imm_mandr_r("pblendw", 59, src, dst);
1375 DO_imm_mandr_r("pblendw", 60, src, dst);
1376 DO_imm_mandr_r("pblendw", 61, src, dst);
1377 DO_imm_mandr_r("pblendw", 62, src, dst);
1378 DO_imm_mandr_r("pblendw", 63, src, dst);
1379 DO_imm_mandr_r("pblendw", 64, src, dst);
1380 DO_imm_mandr_r("pblendw", 65, src, dst);
1381 DO_imm_mandr_r("pblendw", 66, src, dst);
1382 DO_imm_mandr_r("pblendw", 67, src, dst);
1383 DO_imm_mandr_r("pblendw", 68, src, dst);
1384 DO_imm_mandr_r("pblendw", 69, src, dst);
1385 DO_imm_mandr_r("pblendw", 70, src, dst);
1386 DO_imm_mandr_r("pblendw", 71, src, dst);
1387 DO_imm_mandr_r("pblendw", 72, src, dst);
1388 DO_imm_mandr_r("pblendw", 73, src, dst);
1389 DO_imm_mandr_r("pblendw", 74, src, dst);
1390 DO_imm_mandr_r("pblendw", 75, src, dst);
1391 DO_imm_mandr_r("pblendw", 76, src, dst);
1392 DO_imm_mandr_r("pblendw", 77, src, dst);
1393 DO_imm_mandr_r("pblendw", 78, src, dst);
1394 DO_imm_mandr_r("pblendw", 79, src, dst);
1395 DO_imm_mandr_r("pblendw", 80, src, dst);
1396 DO_imm_mandr_r("pblendw", 81, src, dst);
1397 DO_imm_mandr_r("pblendw", 82, src, dst);
1398 DO_imm_mandr_r("pblendw", 83, src, dst);
1399 DO_imm_mandr_r("pblendw", 84, src, dst);
1400 DO_imm_mandr_r("pblendw", 85, src, dst);
1401 DO_imm_mandr_r("pblendw", 86, src, dst);
1402 DO_imm_mandr_r("pblendw", 87, src, dst);
1403 DO_imm_mandr_r("pblendw", 88, src, dst);
1404 DO_imm_mandr_r("pblendw", 89, src, dst);
1405 DO_imm_mandr_r("pblendw", 90, src, dst);
1406 DO_imm_mandr_r("pblendw", 91, src, dst);
1407 DO_imm_mandr_r("pblendw", 92, src, dst);
1408 DO_imm_mandr_r("pblendw", 93, src, dst);
1409 DO_imm_mandr_r("pblendw", 94, src, dst);
1410 DO_imm_mandr_r("pblendw", 95, src, dst);
1411 DO_imm_mandr_r("pblendw", 96, src, dst);
1412 DO_imm_mandr_r("pblendw", 97, src, dst);
1413 DO_imm_mandr_r("pblendw", 98, src, dst);
1414 DO_imm_mandr_r("pblendw", 99, src, dst);
1415 DO_imm_mandr_r("pblendw", 100, src, dst);
1416 DO_imm_mandr_r("pblendw", 101, src, dst);
1417 DO_imm_mandr_r("pblendw", 102, src, dst);
1418 DO_imm_mandr_r("pblendw", 103, src, dst);
1419 DO_imm_mandr_r("pblendw", 104, src, dst);
1420 DO_imm_mandr_r("pblendw", 105, src, dst);
1421 DO_imm_mandr_r("pblendw", 106, src, dst);
1422 DO_imm_mandr_r("pblendw", 107, src, dst);
1423 DO_imm_mandr_r("pblendw", 108, src, dst);
1424 DO_imm_mandr_r("pblendw", 109, src, dst);
1425 DO_imm_mandr_r("pblendw", 110, src, dst);
1426 DO_imm_mandr_r("pblendw", 111, src, dst);
1427 DO_imm_mandr_r("pblendw", 112, src, dst);
1428 DO_imm_mandr_r("pblendw", 113, src, dst);
1429 DO_imm_mandr_r("pblendw", 114, src, dst);
1430 DO_imm_mandr_r("pblendw", 115, src, dst);
1431 DO_imm_mandr_r("pblendw", 116, src, dst);
1432 DO_imm_mandr_r("pblendw", 117, src, dst);
1433 DO_imm_mandr_r("pblendw", 118, src, dst);
1434 DO_imm_mandr_r("pblendw", 119, src, dst);
1435 DO_imm_mandr_r("pblendw", 120, src, dst);
1436 DO_imm_mandr_r("pblendw", 121, src, dst);
1437 DO_imm_mandr_r("pblendw", 122, src, dst);
1438 DO_imm_mandr_r("pblendw", 123, src, dst);
1439 DO_imm_mandr_r("pblendw", 124, src, dst);
1440 DO_imm_mandr_r("pblendw", 125, src, dst);
1441 DO_imm_mandr_r("pblendw", 126, src, dst);
1442 DO_imm_mandr_r("pblendw", 127, src, dst);
1443 DO_imm_mandr_r("pblendw", 128, src, dst);
1444 DO_imm_mandr_r("pblendw", 129, src, dst);
1445 DO_imm_mandr_r("pblendw", 130, src, dst);
1446 DO_imm_mandr_r("pblendw", 131, src, dst);
1447 DO_imm_mandr_r("pblendw", 132, src, dst);
1448 DO_imm_mandr_r("pblendw", 133, src, dst);
1449 DO_imm_mandr_r("pblendw", 134, src, dst);
1450 DO_imm_mandr_r("pblendw", 135, src, dst);
1451 DO_imm_mandr_r("pblendw", 136, src, dst);
1452 DO_imm_mandr_r("pblendw", 137, src, dst);
1453 DO_imm_mandr_r("pblendw", 138, src, dst);
1454 DO_imm_mandr_r("pblendw", 139, src, dst);
1455 DO_imm_mandr_r("pblendw", 140, src, dst);
1456 DO_imm_mandr_r("pblendw", 141, src, dst);
1457 DO_imm_mandr_r("pblendw", 142, src, dst);
1458 DO_imm_mandr_r("pblendw", 143, src, dst);
1459 DO_imm_mandr_r("pblendw", 144, src, dst);
1460 DO_imm_mandr_r("pblendw", 145, src, dst);
1461 DO_imm_mandr_r("pblendw", 146, src, dst);
1462 DO_imm_mandr_r("pblendw", 147, src, dst);
1463 DO_imm_mandr_r("pblendw", 148, src, dst);
1464 DO_imm_mandr_r("pblendw", 149, src, dst);
1465 DO_imm_mandr_r("pblendw", 150, src, dst);
1466 DO_imm_mandr_r("pblendw", 151, src, dst);
1467 DO_imm_mandr_r("pblendw", 152, src, dst);
1468 DO_imm_mandr_r("pblendw", 153, src, dst);
1469 DO_imm_mandr_r("pblendw", 154, src, dst);
1470 DO_imm_mandr_r("pblendw", 155, src, dst);
1471 DO_imm_mandr_r("pblendw", 156, src, dst);
1472 DO_imm_mandr_r("pblendw", 157, src, dst);
1473 DO_imm_mandr_r("pblendw", 158, src, dst);
1474 DO_imm_mandr_r("pblendw", 159, src, dst);
1475 DO_imm_mandr_r("pblendw", 160, src, dst);
1476 DO_imm_mandr_r("pblendw", 161, src, dst);
1477 DO_imm_mandr_r("pblendw", 162, src, dst);
1478 DO_imm_mandr_r("pblendw", 163, src, dst);
1479 DO_imm_mandr_r("pblendw", 164, src, dst);
1480 DO_imm_mandr_r("pblendw", 165, src, dst);
1481 DO_imm_mandr_r("pblendw", 166, src, dst);
1482 DO_imm_mandr_r("pblendw", 167, src, dst);
1483 DO_imm_mandr_r("pblendw", 168, src, dst);
1484 DO_imm_mandr_r("pblendw", 169, src, dst);
1485 DO_imm_mandr_r("pblendw", 170, src, dst);
1486 DO_imm_mandr_r("pblendw", 171, src, dst);
1487 DO_imm_mandr_r("pblendw", 172, src, dst);
1488 DO_imm_mandr_r("pblendw", 173, src, dst);
1489 DO_imm_mandr_r("pblendw", 174, src, dst);
1490 DO_imm_mandr_r("pblendw", 175, src, dst);
1491 DO_imm_mandr_r("pblendw", 176, src, dst);
1492 DO_imm_mandr_r("pblendw", 177, src, dst);
1493 DO_imm_mandr_r("pblendw", 178, src, dst);
1494 DO_imm_mandr_r("pblendw", 179, src, dst);
1495 DO_imm_mandr_r("pblendw", 180, src, dst);
1496 DO_imm_mandr_r("pblendw", 181, src, dst);
1497 DO_imm_mandr_r("pblendw", 182, src, dst);
1498 DO_imm_mandr_r("pblendw", 183, src, dst);
1499 DO_imm_mandr_r("pblendw", 184, src, dst);
1500 DO_imm_mandr_r("pblendw", 185, src, dst);
1501 DO_imm_mandr_r("pblendw", 186, src, dst);
1502 DO_imm_mandr_r("pblendw", 187, src, dst);
1503 DO_imm_mandr_r("pblendw", 188, src, dst);
1504 DO_imm_mandr_r("pblendw", 189, src, dst);
1505 DO_imm_mandr_r("pblendw", 190, src, dst);
1506 DO_imm_mandr_r("pblendw", 191, src, dst);
1507 DO_imm_mandr_r("pblendw", 192, src, dst);
1508 DO_imm_mandr_r("pblendw", 193, src, dst);
1509 DO_imm_mandr_r("pblendw", 194, src, dst);
1510 DO_imm_mandr_r("pblendw", 195, src, dst);
1511 DO_imm_mandr_r("pblendw", 196, src, dst);
1512 DO_imm_mandr_r("pblendw", 197, src, dst);
1513 DO_imm_mandr_r("pblendw", 198, src, dst);
1514 DO_imm_mandr_r("pblendw", 199, src, dst);
1515 DO_imm_mandr_r("pblendw", 200, src, dst);
1516 DO_imm_mandr_r("pblendw", 201, src, dst);
1517 DO_imm_mandr_r("pblendw", 202, src, dst);
1518 DO_imm_mandr_r("pblendw", 203, src, dst);
1519 DO_imm_mandr_r("pblendw", 204, src, dst);
1520 DO_imm_mandr_r("pblendw", 205, src, dst);
1521 DO_imm_mandr_r("pblendw", 206, src, dst);
1522 DO_imm_mandr_r("pblendw", 207, src, dst);
1523 DO_imm_mandr_r("pblendw", 208, src, dst);
1524 DO_imm_mandr_r("pblendw", 209, src, dst);
1525 DO_imm_mandr_r("pblendw", 210, src, dst);
1526 DO_imm_mandr_r("pblendw", 211, src, dst);
1527 DO_imm_mandr_r("pblendw", 212, src, dst);
1528 DO_imm_mandr_r("pblendw", 213, src, dst);
1529 DO_imm_mandr_r("pblendw", 214, src, dst);
1530 DO_imm_mandr_r("pblendw", 215, src, dst);
1531 DO_imm_mandr_r("pblendw", 216, src, dst);
1532 DO_imm_mandr_r("pblendw", 217, src, dst);
1533 DO_imm_mandr_r("pblendw", 218, src, dst);
1534 DO_imm_mandr_r("pblendw", 219, src, dst);
1535 DO_imm_mandr_r("pblendw", 220, src, dst);
1536 DO_imm_mandr_r("pblendw", 221, src, dst);
1537 DO_imm_mandr_r("pblendw", 222, src, dst);
1538 DO_imm_mandr_r("pblendw", 223, src, dst);
1539 DO_imm_mandr_r("pblendw", 224, src, dst);
1540 DO_imm_mandr_r("pblendw", 225, src, dst);
1541 DO_imm_mandr_r("pblendw", 226, src, dst);
1542 DO_imm_mandr_r("pblendw", 227, src, dst);
1543 DO_imm_mandr_r("pblendw", 228, src, dst);
1544 DO_imm_mandr_r("pblendw", 229, src, dst);
1545 DO_imm_mandr_r("pblendw", 230, src, dst);
1546 DO_imm_mandr_r("pblendw", 231, src, dst);
1547 DO_imm_mandr_r("pblendw", 232, src, dst);
1548 DO_imm_mandr_r("pblendw", 233, src, dst);
1549 DO_imm_mandr_r("pblendw", 234, src, dst);
1550 DO_imm_mandr_r("pblendw", 235, src, dst);
1551 DO_imm_mandr_r("pblendw", 236, src, dst);
1552 DO_imm_mandr_r("pblendw", 237, src, dst);
1553 DO_imm_mandr_r("pblendw", 238, src, dst);
1554 DO_imm_mandr_r("pblendw", 239, src, dst);
1555 DO_imm_mandr_r("pblendw", 240, src, dst);
1556 DO_imm_mandr_r("pblendw", 241, src, dst);
1557 DO_imm_mandr_r("pblendw", 242, src, dst);
1558 DO_imm_mandr_r("pblendw", 243, src, dst);
1559 DO_imm_mandr_r("pblendw", 244, src, dst);
1560 DO_imm_mandr_r("pblendw", 245, src, dst);
1561 DO_imm_mandr_r("pblendw", 246, src, dst);
1562 DO_imm_mandr_r("pblendw", 247, src, dst);
1563 DO_imm_mandr_r("pblendw", 248, src, dst);
1564 DO_imm_mandr_r("pblendw", 249, src, dst);
1565 DO_imm_mandr_r("pblendw", 250, src, dst);
1566 DO_imm_mandr_r("pblendw", 251, src, dst);
1567 DO_imm_mandr_r("pblendw", 252, src, dst);
1568 DO_imm_mandr_r("pblendw", 253, src, dst);
1569 DO_imm_mandr_r("pblendw", 254, src, dst);
1570 DO_imm_mandr_r("pblendw", 255, src, dst);
1571 }
1572 }
1573
1574
test_PCMPEQQ(void)1575 void test_PCMPEQQ ( void )
1576 {
1577 V128 src, dst;
1578 Int i;
1579 for (i = 0; i < 10; i++) {
1580 randV128(&src);
1581 randV128(&dst);
1582 switch (i - 6) {
1583 case 0: memset(&src[0], 0x55, 8);
1584 memset(&dst[0], 0x55, 8); break;
1585 case 1: memset(&src[8], 0x55, 8);
1586 memset(&dst[8], 0x55, 8); break;
1587 default:
1588 break;
1589 }
1590 DO_mandr_r("pcmpeqq", src, dst);
1591 }
1592 }
1593
1594
test_PEXTRB(void)1595 void test_PEXTRB ( void )
1596 {
1597 V128 src;
1598 randV128(&src);
1599 DO_imm_r_to_mandrscalar("pextrb", 0, src, "d");
1600 DO_imm_r_to_mandrscalar("pextrb", 1, src, "d");
1601 DO_imm_r_to_mandrscalar("pextrb", 2, src, "d");
1602 DO_imm_r_to_mandrscalar("pextrb", 3, src, "d");
1603 DO_imm_r_to_mandrscalar("pextrb", 4, src, "d");
1604 DO_imm_r_to_mandrscalar("pextrb", 5, src, "d");
1605 DO_imm_r_to_mandrscalar("pextrb", 6, src, "d");
1606 DO_imm_r_to_mandrscalar("pextrb", 7, src, "d");
1607 DO_imm_r_to_mandrscalar("pextrb", 8, src, "d");
1608 DO_imm_r_to_mandrscalar("pextrb", 9, src, "d");
1609 DO_imm_r_to_mandrscalar("pextrb", 10, src, "d");
1610 DO_imm_r_to_mandrscalar("pextrb", 11, src, "d");
1611 DO_imm_r_to_mandrscalar("pextrb", 12, src, "d");
1612 DO_imm_r_to_mandrscalar("pextrb", 13, src, "d");
1613 DO_imm_r_to_mandrscalar("pextrb", 14, src, "d");
1614 DO_imm_r_to_mandrscalar("pextrb", 15, src, "d");
1615 }
1616
test_PINSRB(void)1617 void test_PINSRB ( void )
1618 {
1619 ULong src;
1620 src = randULong();
1621 DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d");
1622 src = randULong();
1623 DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d");
1624 src = randULong();
1625 DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d");
1626 src = randULong();
1627 DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d");
1628 src = randULong();
1629 DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d");
1630 src = randULong();
1631 DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d");
1632 src = randULong();
1633 DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d");
1634 src = randULong();
1635 DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d");
1636 src = randULong();
1637 DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d");
1638 src = randULong();
1639 DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d");
1640 src = randULong();
1641 DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d");
1642 src = randULong();
1643 DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d");
1644 src = randULong();
1645 DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d");
1646 src = randULong();
1647 DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d");
1648 src = randULong();
1649 DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d");
1650 src = randULong();
1651 DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d");
1652 }
1653
1654
test_PEXTRW(void)1655 void test_PEXTRW ( void )
1656 {
1657 V128 src;
1658 randV128(&src);
1659 DO_imm_r_to_mandrscalar("pextrw", 0, src, "d");
1660 DO_imm_r_to_mandrscalar("pextrw", 1, src, "d");
1661 DO_imm_r_to_mandrscalar("pextrw", 2, src, "d");
1662 DO_imm_r_to_mandrscalar("pextrw", 3, src, "d");
1663 DO_imm_r_to_mandrscalar("pextrw", 4, src, "d");
1664 DO_imm_r_to_mandrscalar("pextrw", 5, src, "d");
1665 DO_imm_r_to_mandrscalar("pextrw", 6, src, "d");
1666 DO_imm_r_to_mandrscalar("pextrw", 7, src, "d");
1667 }
1668
test_PINSRW(void)1669 void test_PINSRW ( void )
1670 {
1671 ULong src;
1672 src = randULong();
1673 DO_imm_mandrscalar_to_r("pinsrw", 0, src, "d");
1674 src = randULong();
1675 DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d");
1676 src = randULong();
1677 DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d");
1678 src = randULong();
1679 DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d");
1680 src = randULong();
1681 DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d");
1682 src = randULong();
1683 DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d");
1684 src = randULong();
1685 DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d");
1686 src = randULong();
1687 DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d");
1688 }
1689
1690
test_PEXTRD(void)1691 void test_PEXTRD ( void )
1692 {
1693 V128 src;
1694 randV128(&src);
1695 DO_imm_r_to_mandrscalar("pextrd", 0, src, "d");
1696 DO_imm_r_to_mandrscalar("pextrd", 1, src, "d");
1697 DO_imm_r_to_mandrscalar("pextrd", 2, src, "d");
1698 DO_imm_r_to_mandrscalar("pextrd", 3, src, "d");
1699 }
1700
test_PINSRD(void)1701 void test_PINSRD ( void )
1702 {
1703 ULong src;
1704 src = randULong();
1705 DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d");
1706 src = randULong();
1707 DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d");
1708 src = randULong();
1709 DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d");
1710 src = randULong();
1711 DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d");
1712 }
1713
1714
test_PEXTRQ(void)1715 void test_PEXTRQ ( void )
1716 {
1717 V128 src;
1718 randV128(&src);
1719 DO_imm_r_to_mandrscalar("pextrq", 0, src, "");
1720 DO_imm_r_to_mandrscalar("pextrq", 1, src, "");
1721 }
1722
test_PINSRQ(void)1723 void test_PINSRQ ( void )
1724 {
1725 ULong src;
1726 src = randULong();
1727 DO_imm_mandrscalar_to_r("pinsrq", 0, src, "");
1728 src = randULong();
1729 DO_imm_mandrscalar_to_r("pinsrq", 1, src, "");
1730 }
1731
1732
test_EXTRACTPS(void)1733 void test_EXTRACTPS ( void )
1734 {
1735 V128 src;
1736 randV128(&src);
1737 DO_imm_r_to_mandrscalar("extractps", 0, src, "d");
1738 DO_imm_r_to_mandrscalar("extractps", 1, src, "d");
1739 DO_imm_r_to_mandrscalar("extractps", 2, src, "d");
1740 DO_imm_r_to_mandrscalar("extractps", 3, src, "d");
1741 }
1742
1743
test_PHMINPOSUW(void)1744 void test_PHMINPOSUW ( void )
1745 {
1746 V128 src, dst;
1747 Int i;
1748 for (i = 0; i < 20; i++) {
1749 randV128(&src);
1750 randV128(&dst);
1751 DO_mandr_r("phminposuw", src, dst);
1752 }
1753 memset(src, 0x55, sizeof(src));
1754 memset(dst, 0xAA, sizeof(dst));
1755 DO_mandr_r("phminposuw", src, dst);
1756 }
1757
test_PMAXSB(void)1758 void test_PMAXSB ( void )
1759 {
1760 V128 src, dst;
1761 Int i;
1762 for (i = 0; i < 10; i++) {
1763 randV128(&src);
1764 randV128(&dst);
1765 DO_mandr_r("pmaxsb", src, dst);
1766 }
1767 }
1768
test_PMAXSD(void)1769 void test_PMAXSD ( void )
1770 {
1771 V128 src, dst;
1772 Int i;
1773 for (i = 0; i < 10; i++) {
1774 randV128(&src);
1775 randV128(&dst);
1776 DO_mandr_r("pmaxsd", src, dst);
1777 }
1778 }
1779
test_PMAXUD(void)1780 void test_PMAXUD ( void )
1781 {
1782 V128 src, dst;
1783 Int i;
1784 for (i = 0; i < 10; i++) {
1785 randV128(&src);
1786 randV128(&dst);
1787 DO_mandr_r("pmaxud", src, dst);
1788 }
1789 }
1790
test_PMAXUW(void)1791 void test_PMAXUW ( void )
1792 {
1793 V128 src, dst;
1794 Int i;
1795 for (i = 0; i < 10; i++) {
1796 randV128(&src);
1797 randV128(&dst);
1798 DO_mandr_r("pmaxuw", src, dst);
1799 }
1800 }
1801
test_PMINSB(void)1802 void test_PMINSB ( void )
1803 {
1804 V128 src, dst;
1805 Int i;
1806 for (i = 0; i < 10; i++) {
1807 randV128(&src);
1808 randV128(&dst);
1809 DO_mandr_r("pminsb", src, dst);
1810 }
1811 }
1812
test_PMINSD(void)1813 void test_PMINSD ( void )
1814 {
1815 V128 src, dst;
1816 Int i;
1817 for (i = 0; i < 10; i++) {
1818 randV128(&src);
1819 randV128(&dst);
1820 DO_mandr_r("pminsd", src, dst);
1821 }
1822 }
1823
test_PMINUD(void)1824 void test_PMINUD ( void )
1825 {
1826 V128 src, dst;
1827 Int i;
1828 for (i = 0; i < 10; i++) {
1829 randV128(&src);
1830 randV128(&dst);
1831 DO_mandr_r("pminud", src, dst);
1832 }
1833 }
1834
test_PMINUW(void)1835 void test_PMINUW ( void )
1836 {
1837 V128 src, dst;
1838 Int i;
1839 for (i = 0; i < 10; i++) {
1840 randV128(&src);
1841 randV128(&dst);
1842 DO_mandr_r("pminuw", src, dst);
1843 }
1844 }
1845
test_PMOVSXBW(void)1846 void test_PMOVSXBW ( void )
1847 {
1848 V128 src, dst;
1849 Int i;
1850 for (i = 0; i < 10; i++) {
1851 randV128(&src);
1852 randV128(&dst);
1853 DO_mandr_r("pmovsxbw", src, dst);
1854 }
1855 }
1856
test_PMOVSXBD(void)1857 void test_PMOVSXBD ( void )
1858 {
1859 V128 src, dst;
1860 Int i;
1861 for (i = 0; i < 10; i++) {
1862 randV128(&src);
1863 randV128(&dst);
1864 DO_mandr_r("pmovsxbd", src, dst);
1865 }
1866 }
1867
test_PMOVSXBQ(void)1868 void test_PMOVSXBQ ( void )
1869 {
1870 V128 src, dst;
1871 Int i;
1872 for (i = 0; i < 10; i++) {
1873 randV128(&src);
1874 randV128(&dst);
1875 DO_mandr_r("pmovsxbq", src, dst);
1876 }
1877 }
1878
test_PMOVSXWD(void)1879 void test_PMOVSXWD ( void )
1880 {
1881 V128 src, dst;
1882 Int i;
1883 for (i = 0; i < 10; i++) {
1884 randV128(&src);
1885 randV128(&dst);
1886 DO_mandr_r("pmovsxwd", src, dst);
1887 }
1888 }
1889
test_PMOVSXWQ(void)1890 void test_PMOVSXWQ ( void )
1891 {
1892 V128 src, dst;
1893 Int i;
1894 for (i = 0; i < 10; i++) {
1895 randV128(&src);
1896 randV128(&dst);
1897 DO_mandr_r("pmovsxwq", src, dst);
1898 }
1899 }
1900
test_PMOVSXDQ(void)1901 void test_PMOVSXDQ ( void )
1902 {
1903 V128 src, dst;
1904 Int i;
1905 for (i = 0; i < 10; i++) {
1906 randV128(&src);
1907 randV128(&dst);
1908 DO_mandr_r("pmovsxdq", src, dst);
1909 }
1910 }
1911
test_PMOVZXBW(void)1912 void test_PMOVZXBW ( void )
1913 {
1914 V128 src, dst;
1915 Int i;
1916 for (i = 0; i < 10; i++) {
1917 randV128(&src);
1918 randV128(&dst);
1919 DO_mandr_r("pmovzxbw", src, dst);
1920 }
1921 }
1922
test_PMOVZXBD(void)1923 void test_PMOVZXBD ( void )
1924 {
1925 V128 src, dst;
1926 Int i;
1927 for (i = 0; i < 10; i++) {
1928 randV128(&src);
1929 randV128(&dst);
1930 DO_mandr_r("pmovzxbd", src, dst);
1931 }
1932 }
1933
test_PMOVZXBQ(void)1934 void test_PMOVZXBQ ( void )
1935 {
1936 V128 src, dst;
1937 Int i;
1938 for (i = 0; i < 10; i++) {
1939 randV128(&src);
1940 randV128(&dst);
1941 DO_mandr_r("pmovzxbq", src, dst);
1942 }
1943 }
1944
test_PMOVZXWD(void)1945 void test_PMOVZXWD ( void )
1946 {
1947 V128 src, dst;
1948 Int i;
1949 for (i = 0; i < 10; i++) {
1950 randV128(&src);
1951 randV128(&dst);
1952 DO_mandr_r("pmovzxwd", src, dst);
1953 }
1954 }
1955
test_PMOVZXWQ(void)1956 void test_PMOVZXWQ ( void )
1957 {
1958 V128 src, dst;
1959 Int i;
1960 for (i = 0; i < 10; i++) {
1961 randV128(&src);
1962 randV128(&dst);
1963 DO_mandr_r("pmovzxwq", src, dst);
1964 }
1965 }
1966
test_PMOVZXDQ(void)1967 void test_PMOVZXDQ ( void )
1968 {
1969 V128 src, dst;
1970 Int i;
1971 for (i = 0; i < 10; i++) {
1972 randV128(&src);
1973 randV128(&dst);
1974 DO_mandr_r("pmovzxdq", src, dst);
1975 }
1976 }
1977
test_PMULDQ(void)1978 void test_PMULDQ ( void )
1979 {
1980 V128 src, dst;
1981 Int i;
1982 for (i = 0; i < 10; i++) {
1983 randV128(&src);
1984 randV128(&dst);
1985 DO_mandr_r("pmuldq", src, dst);
1986 }
1987 }
1988
1989
test_PMULLD(void)1990 void test_PMULLD ( void )
1991 {
1992 V128 src, dst;
1993 Int i;
1994 for (i = 0; i < 10; i++) {
1995 randV128(&src);
1996 randV128(&dst);
1997 DO_mandr_r("pmulld", src, dst);
1998 }
1999 }
2000
2001
test_POPCNTQ(void)2002 void test_POPCNTQ ( void )
2003 {
2004 ULong block[4];
2005 Int i;
2006 ULong oszacp_mask = 0x8D5;
2007 for (i = 0; i < 10; i++) {
2008 block[0] = i == 0 ? 0 : randULong();
2009 block[1] = randULong();
2010 block[2] = randULong();
2011 block[3] = randULong();
2012 __asm__ __volatile__(
2013 "movq %0, %%rax" "\n\t"
2014 "movq 0(%%rax), %%rdi" "\n\t"
2015 "movq 8(%%rax), %%r11" "\n\t"
2016 #ifndef VGP_amd64_darwin
2017 "popcntq %%rdi, %%r11" "\n\t"
2018 #else
2019 "popcnt %%rdi, %%r11" "\n\t"
2020 #endif
2021 "movq %%r11, 16(%%rax)" "\n\t"
2022 "pushfq" "\n\t"
2023 "popq %%r12" "\n\t"
2024 "movq %%r12, 24(%%rax)" "\n"
2025 : /*out*/
2026 : /*in*/"r"(&block[0])
2027 : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2028 );
2029 printf("r popcntq %016llx %016llx %016llx %016llx\n",
2030 block[0], block[1], block[2], block[3] & oszacp_mask);
2031
2032 block[0] = i == 0 ? 0 : randULong();
2033 block[1] = randULong();
2034 block[2] = randULong();
2035 block[3] = randULong();
2036 __asm__ __volatile__(
2037 "movq %0, %%rax" "\n\t"
2038 "movq 8(%%rax), %%r11" "\n\t"
2039 #ifndef VGP_amd64_darwin
2040 "popcntq 0(%%rax), %%r11" "\n\t"
2041 #else
2042 "popcnt 0(%%rax), %%r11" "\n\t"
2043 #endif
2044 "movq %%r11, 16(%%rax)" "\n\t"
2045 "pushfq" "\n\t"
2046 "popq %%r12" "\n\t"
2047 "movq %%r12, 24(%%rax)" "\n"
2048 : /*out*/
2049 : /*in*/"r"(&block[0])
2050 : /*trash*/ "cc", "memory", "r11", "r12"
2051 );
2052 printf("m popcntq %016llx %016llx %016llx %016llx\n",
2053 block[0], block[1], block[2], block[3] & oszacp_mask);
2054 }
2055 }
2056
2057
test_POPCNTL(void)2058 void test_POPCNTL ( void )
2059 {
2060 ULong block[4];
2061 Int i;
2062 ULong oszacp_mask = 0x8D5;
2063 for (i = 0; i < 10; i++) {
2064 block[0] = i == 0 ? 0 : randULong();
2065 block[1] = randULong();
2066 block[2] = randULong();
2067 block[3] = randULong();
2068 __asm__ __volatile__(
2069 "movq %0, %%rax" "\n\t"
2070 "movq 0(%%rax), %%rdi" "\n\t"
2071 "movq 8(%%rax), %%r11" "\n\t"
2072 #ifndef VGP_amd64_darwin
2073 "popcntl %%edi, %%r11d" "\n\t"
2074 #else
2075 "popcnt %%edi, %%r11d" "\n\t"
2076 #endif
2077 "movq %%r11, 16(%%rax)" "\n\t"
2078 "pushfq" "\n\t"
2079 "popq %%r12" "\n\t"
2080 "movq %%r12, 24(%%rax)" "\n"
2081 : /*out*/
2082 : /*in*/"r"(&block[0])
2083 : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2084 );
2085 printf("r popcntl %016llx %016llx %016llx %016llx\n",
2086 block[0], block[1], block[2], block[3] & oszacp_mask);
2087
2088 block[0] = i == 0 ? 0 : randULong();
2089 block[1] = randULong();
2090 block[2] = randULong();
2091 block[3] = randULong();
2092 __asm__ __volatile__(
2093 "movq %0, %%rax" "\n\t"
2094 "movq 8(%%rax), %%r11" "\n\t"
2095 #ifndef VGP_amd64_darwin
2096 "popcntl 0(%%rax), %%r11d" "\n\t"
2097 #else
2098 "popcnt 0(%%rax), %%r11d" "\n\t"
2099 #endif
2100 "movq %%r11, 16(%%rax)" "\n\t"
2101 "pushfq" "\n\t"
2102 "popq %%r12" "\n\t"
2103 "movq %%r12, 24(%%rax)" "\n"
2104 : /*out*/
2105 : /*in*/"r"(&block[0])
2106 : /*trash*/ "cc", "memory", "r11", "r12"
2107 );
2108 printf("m popcntl %016llx %016llx %016llx %016llx\n",
2109 block[0], block[1], block[2], block[3] & oszacp_mask);
2110 }
2111 }
2112
2113
test_POPCNTW(void)2114 void test_POPCNTW ( void )
2115 {
2116 ULong block[4];
2117 Int i;
2118 ULong oszacp_mask = 0x8D5;
2119 for (i = 0; i < 10; i++) {
2120 block[0] = i == 0 ? 0 : randULong();
2121 block[1] = randULong();
2122 block[2] = randULong();
2123 block[3] = randULong();
2124 __asm__ __volatile__(
2125 "movq %0, %%rax" "\n\t"
2126 "movq 0(%%rax), %%rdi" "\n\t"
2127 "movq 8(%%rax), %%r11" "\n\t"
2128 #ifndef VGP_amd64_darwin
2129 "popcntw %%di, %%r11w" "\n\t"
2130 #else
2131 "popcnt %%di, %%r11w" "\n\t"
2132 #endif
2133 "movq %%r11, 16(%%rax)" "\n\t"
2134 "pushfq" "\n\t"
2135 "popq %%r12" "\n\t"
2136 "movq %%r12, 24(%%rax)" "\n"
2137 : /*out*/
2138 : /*in*/"r"(&block[0])
2139 : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2140 );
2141 printf("r popcntw %016llx %016llx %016llx %016llx\n",
2142 block[0], block[1], block[2], block[3] & oszacp_mask);
2143
2144 block[0] = i == 0 ? 0 : randULong();
2145 block[1] = randULong();
2146 block[2] = randULong();
2147 block[3] = randULong();
2148 __asm__ __volatile__(
2149 "movq %0, %%rax" "\n\t"
2150 "movq 8(%%rax), %%r11" "\n\t"
2151 #ifndef VGP_amd64_darwin
2152 "popcntw 0(%%rax), %%r11w" "\n\t"
2153 #else
2154 "popcnt 0(%%rax), %%r11w" "\n\t"
2155 #endif
2156 "movq %%r11, 16(%%rax)" "\n\t"
2157 "pushfq" "\n\t"
2158 "popq %%r12" "\n\t"
2159 "movq %%r12, 24(%%rax)" "\n"
2160 : /*out*/
2161 : /*in*/"r"(&block[0])
2162 : /*trash*/ "cc", "memory", "r11", "r12"
2163 );
2164 printf("m popcntw %016llx %016llx %016llx %016llx\n",
2165 block[0], block[1], block[2], block[3] & oszacp_mask);
2166 }
2167 }
2168
2169
test_PCMPGTQ(void)2170 void test_PCMPGTQ ( void )
2171 {
2172 V128 spec[7];
2173 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL );
2174 do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL );
2175 do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL );
2176 do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL );
2177 do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL );
2178 do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL );
2179 do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL );
2180
2181 V128 src, dst;
2182 Int i, j;
2183 for (i = 0; i < 10; i++) {
2184 randV128(&src);
2185 randV128(&dst);
2186 DO_mandr_r("pcmpgtq", src, dst);
2187 }
2188 for (i = 0; i < 7; i++) {
2189 for (j = 0; j < 7; j++) {
2190 memcpy(&src, &spec[i], 16);
2191 memcpy(&dst, &spec[j], 16);
2192 DO_mandr_r("pcmpgtq", src, dst);
2193 }
2194 }
2195 }
2196
2197 /* ------------ ROUNDSD ------------ */
2198
/* Execute ROUNDSD with imm8 = 0 (rounding taken from the immediate:
   round to nearest).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $0, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $0, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2222
/* Execute ROUNDSD with imm8 = 1 (rounding taken from the immediate:
   round down, toward -infinity).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $1, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $1, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2246
/* Execute ROUNDSD with imm8 = 2 (rounding taken from the immediate:
   round up, toward +infinity).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $2, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $2, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2270
/* Execute ROUNDSD with imm8 = 3 (rounding taken from the immediate:
   round toward zero / truncate).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $3, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $3, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2294
/* Execute ROUNDSD with imm8 = 4 (bit 2 set: the rounding mode comes from
   MXCSR.RC rather than from the immediate).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundsd $4, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundsd $4, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2318
test_ROUNDSD_w_immediate_rounding(void)2319 void test_ROUNDSD_w_immediate_rounding ( void )
2320 {
2321 double vals[22];
2322 Int i = 0;
2323 vals[i++] = 0.0;
2324 vals[i++] = -0.0;
2325 vals[i++] = mkPosInf();
2326 vals[i++] = mkNegInf();
2327 vals[i++] = mkPosNan();
2328 vals[i++] = mkNegNan();
2329 vals[i++] = -1.3;
2330 vals[i++] = -1.1;
2331 vals[i++] = -0.9;
2332 vals[i++] = -0.7;
2333 vals[i++] = -0.50001;
2334 vals[i++] = -0.49999;
2335 vals[i++] = -0.3;
2336 vals[i++] = -0.1;
2337 vals[i++] = 0.1;
2338 vals[i++] = 0.3;
2339 vals[i++] = 0.49999;
2340 vals[i++] = 0.50001;
2341 vals[i++] = 0.7;
2342 vals[i++] = 0.9;
2343 vals[i++] = 1.1;
2344 vals[i++] = 1.3;
2345 assert(i == 22);
2346
2347 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2348 V128 src, dst;
2349
2350 randV128(&src);
2351 randV128(&dst);
2352 memcpy(&src[0], &vals[i], 8);
2353 do_ROUNDSD_000(False/*reg*/, &src, &dst);
2354 printf("r roundsd_000 ");
2355 showV128(&src);
2356 printf(" ");
2357 showV128(&dst);
2358 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2359 printf("\n");
2360
2361 randV128(&src);
2362 randV128(&dst);
2363 memcpy(&src[0], &vals[i], 8);
2364 do_ROUNDSD_000(True/*mem*/, &src, &dst);
2365 printf("m roundsd_000 ");
2366 showV128(&src);
2367 printf(" ");
2368 showV128(&dst);
2369 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2370 printf("\n");
2371
2372
2373 randV128(&src);
2374 randV128(&dst);
2375 memcpy(&src[0], &vals[i], 8);
2376 do_ROUNDSD_001(False/*reg*/, &src, &dst);
2377 printf("r roundsd_001 ");
2378 showV128(&src);
2379 printf(" ");
2380 showV128(&dst);
2381 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2382 printf("\n");
2383
2384 randV128(&src);
2385 randV128(&dst);
2386 memcpy(&src[0], &vals[i], 8);
2387 do_ROUNDSD_001(True/*mem*/, &src, &dst);
2388 printf("m roundsd_001 ");
2389 showV128(&src);
2390 printf(" ");
2391 showV128(&dst);
2392 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2393 printf("\n");
2394
2395
2396 randV128(&src);
2397 randV128(&dst);
2398 memcpy(&src[0], &vals[i], 8);
2399 do_ROUNDSD_010(False/*reg*/, &src, &dst);
2400 printf("r roundsd_010 ");
2401 showV128(&src);
2402 printf(" ");
2403 showV128(&dst);
2404 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2405 printf("\n");
2406
2407 randV128(&src);
2408 randV128(&dst);
2409 memcpy(&src[0], &vals[i], 8);
2410 do_ROUNDSD_010(True/*mem*/, &src, &dst);
2411 printf("m roundsd_010 ");
2412 showV128(&src);
2413 printf(" ");
2414 showV128(&dst);
2415 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2416 printf("\n");
2417
2418
2419 randV128(&src);
2420 randV128(&dst);
2421 memcpy(&src[0], &vals[i], 8);
2422 do_ROUNDSD_011(False/*reg*/, &src, &dst);
2423 printf("r roundsd_011 ");
2424 showV128(&src);
2425 printf(" ");
2426 showV128(&dst);
2427 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2428 printf("\n");
2429
2430 randV128(&src);
2431 randV128(&dst);
2432 memcpy(&src[0], &vals[i], 8);
2433 do_ROUNDSD_011(True/*mem*/, &src, &dst);
2434 printf("m roundsd_011 ");
2435 showV128(&src);
2436 printf(" ");
2437 showV128(&dst);
2438 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2439 printf("\n");
2440 }
2441 }
2442
test_ROUNDSD_w_mxcsr_rounding(void)2443 void test_ROUNDSD_w_mxcsr_rounding ( void )
2444 {
2445 UInt rm;
2446 double vals[22];
2447 Int i = 0;
2448 vals[i++] = 0.0;
2449 vals[i++] = -0.0;
2450 vals[i++] = mkPosInf();
2451 vals[i++] = mkNegInf();
2452 vals[i++] = mkPosNan();
2453 vals[i++] = mkNegNan();
2454 vals[i++] = -1.3;
2455 vals[i++] = -1.1;
2456 vals[i++] = -0.9;
2457 vals[i++] = -0.7;
2458 vals[i++] = -0.50001;
2459 vals[i++] = -0.49999;
2460 vals[i++] = -0.3;
2461 vals[i++] = -0.1;
2462 vals[i++] = 0.1;
2463 vals[i++] = 0.3;
2464 vals[i++] = 0.49999;
2465 vals[i++] = 0.50001;
2466 vals[i++] = 0.7;
2467 vals[i++] = 0.9;
2468 vals[i++] = 1.1;
2469 vals[i++] = 1.3;
2470 assert(i == 22);
2471
2472 rm = get_sse_roundingmode();
2473 assert(rm == 0); // 0 == RN == default
2474
2475 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2476 V128 src, dst;
2477
2478 for (rm = 0; rm <= 3; rm++) {
2479 set_sse_roundingmode(rm);
2480
2481 randV128(&src);
2482 randV128(&dst);
2483 memcpy(&src[0], &vals[i], 8);
2484 do_ROUNDSD_1XX(False/*reg*/, &src, &dst);
2485 printf("r (rm=%u) roundsd_1XX ", rm);
2486 showV128(&src);
2487 printf(" ");
2488 showV128(&dst);
2489 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2490 printf("\n");
2491
2492 randV128(&src);
2493 randV128(&dst);
2494 memcpy(&src[0], &vals[i], 8);
2495 do_ROUNDSD_1XX(True/*mem*/, &src, &dst);
2496 printf("m (rm=%u) roundsd_1XX ", rm);
2497 showV128(&src);
2498 printf(" ");
2499 showV128(&dst);
2500 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2501 printf("\n");
2502 }
2503 }
2504
2505 rm = get_sse_roundingmode();
2506 assert(rm == 3);
2507 set_sse_roundingmode(0);
2508 rm = get_sse_roundingmode();
2509 assert(rm == 0); // 0 == RN == default
2510 }
2511
2512
2513 /* ------------ ROUNDSS ------------ */
2514
/* Execute ROUNDSS with imm8 = 0 (rounding taken from the immediate:
   round to nearest).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $0, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $0, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2538
/* Execute ROUNDSS with imm8 = 1 (rounding taken from the immediate:
   round down, toward -infinity).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $1, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $1, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2562
/* Execute ROUNDSS with imm8 = 2 (rounding taken from the immediate:
   round up, toward +infinity).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $2, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $2, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2586
/* Execute ROUNDSS with imm8 = 3 (rounding taken from the immediate:
   round toward zero / truncate).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $3, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $3, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2610
/* Execute ROUNDSS with imm8 = 4 (bit 2 set: the rounding mode comes from
   MXCSR.RC rather than from the immediate).

   mem != 0: the source operand is read directly from memory at src.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundss $4, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundss $4, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2634
test_ROUNDSS_w_immediate_rounding(void)2635 void test_ROUNDSS_w_immediate_rounding ( void )
2636 {
2637 float vals[22];
2638 Int i = 0;
2639 vals[i++] = 0.0;
2640 vals[i++] = -0.0;
2641 vals[i++] = mkPosInf();
2642 vals[i++] = mkNegInf();
2643 vals[i++] = mkPosNan();
2644 vals[i++] = mkNegNan();
2645 vals[i++] = -1.3;
2646 vals[i++] = -1.1;
2647 vals[i++] = -0.9;
2648 vals[i++] = -0.7;
2649 vals[i++] = -0.50001;
2650 vals[i++] = -0.49999;
2651 vals[i++] = -0.3;
2652 vals[i++] = -0.1;
2653 vals[i++] = 0.1;
2654 vals[i++] = 0.3;
2655 vals[i++] = 0.49999;
2656 vals[i++] = 0.50001;
2657 vals[i++] = 0.7;
2658 vals[i++] = 0.9;
2659 vals[i++] = 1.1;
2660 vals[i++] = 1.3;
2661 assert(i == 22);
2662
2663 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2664 V128 src, dst;
2665
2666 randV128(&src);
2667 randV128(&dst);
2668 memcpy(&src[0], &vals[i], 4);
2669 do_ROUNDSS_000(False/*reg*/, &src, &dst);
2670 printf("r roundss_000 ");
2671 showV128(&src);
2672 printf(" ");
2673 showV128(&dst);
2674 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2675 printf("\n");
2676
2677 randV128(&src);
2678 randV128(&dst);
2679 memcpy(&src[0], &vals[i], 4);
2680 do_ROUNDSS_000(True/*mem*/, &src, &dst);
2681 printf("m roundss_000 ");
2682 showV128(&src);
2683 printf(" ");
2684 showV128(&dst);
2685 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2686 printf("\n");
2687
2688
2689 randV128(&src);
2690 randV128(&dst);
2691 memcpy(&src[0], &vals[i], 4);
2692 do_ROUNDSS_001(False/*reg*/, &src, &dst);
2693 printf("r roundss_001 ");
2694 showV128(&src);
2695 printf(" ");
2696 showV128(&dst);
2697 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2698 printf("\n");
2699
2700 randV128(&src);
2701 randV128(&dst);
2702 memcpy(&src[0], &vals[i], 4);
2703 do_ROUNDSS_001(True/*mem*/, &src, &dst);
2704 printf("m roundss_001 ");
2705 showV128(&src);
2706 printf(" ");
2707 showV128(&dst);
2708 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2709 printf("\n");
2710
2711
2712 randV128(&src);
2713 randV128(&dst);
2714 memcpy(&src[0], &vals[i], 4);
2715 do_ROUNDSS_010(False/*reg*/, &src, &dst);
2716 printf("r roundss_010 ");
2717 showV128(&src);
2718 printf(" ");
2719 showV128(&dst);
2720 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2721 printf("\n");
2722
2723 randV128(&src);
2724 randV128(&dst);
2725 memcpy(&src[0], &vals[i], 4);
2726 do_ROUNDSS_010(True/*mem*/, &src, &dst);
2727 printf("m roundss_010 ");
2728 showV128(&src);
2729 printf(" ");
2730 showV128(&dst);
2731 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2732 printf("\n");
2733
2734
2735 randV128(&src);
2736 randV128(&dst);
2737 memcpy(&src[0], &vals[i], 4);
2738 do_ROUNDSS_011(False/*reg*/, &src, &dst);
2739 printf("r roundss_011 ");
2740 showV128(&src);
2741 printf(" ");
2742 showV128(&dst);
2743 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2744 printf("\n");
2745
2746 randV128(&src);
2747 randV128(&dst);
2748 memcpy(&src[0], &vals[i], 4);
2749 do_ROUNDSS_011(True/*mem*/, &src, &dst);
2750 printf("m roundss_011 ");
2751 showV128(&src);
2752 printf(" ");
2753 showV128(&dst);
2754 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2755 printf("\n");
2756 }
2757 }
2758
test_ROUNDSS_w_mxcsr_rounding(void)2759 void test_ROUNDSS_w_mxcsr_rounding ( void )
2760 {
2761 UInt rm;
2762 float vals[22];
2763 Int i = 0;
2764 vals[i++] = 0.0;
2765 vals[i++] = -0.0;
2766 vals[i++] = mkPosInf();
2767 vals[i++] = mkNegInf();
2768 vals[i++] = mkPosNan();
2769 vals[i++] = mkNegNan();
2770 vals[i++] = -1.3;
2771 vals[i++] = -1.1;
2772 vals[i++] = -0.9;
2773 vals[i++] = -0.7;
2774 vals[i++] = -0.50001;
2775 vals[i++] = -0.49999;
2776 vals[i++] = -0.3;
2777 vals[i++] = -0.1;
2778 vals[i++] = 0.1;
2779 vals[i++] = 0.3;
2780 vals[i++] = 0.49999;
2781 vals[i++] = 0.50001;
2782 vals[i++] = 0.7;
2783 vals[i++] = 0.9;
2784 vals[i++] = 1.1;
2785 vals[i++] = 1.3;
2786 assert(i == 22);
2787
2788 rm = get_sse_roundingmode();
2789 assert(rm == 0); // 0 == RN == default
2790
2791 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2792 V128 src, dst;
2793
2794 for (rm = 0; rm <= 3; rm++) {
2795 set_sse_roundingmode(rm);
2796
2797 randV128(&src);
2798 randV128(&dst);
2799 memcpy(&src[0], &vals[i], 4);
2800 do_ROUNDSS_1XX(False/*reg*/, &src, &dst);
2801 printf("r (rm=%u) roundss_1XX ", rm);
2802 showV128(&src);
2803 printf(" ");
2804 showV128(&dst);
2805 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2806 printf("\n");
2807
2808 randV128(&src);
2809 randV128(&dst);
2810 memcpy(&src[0], &vals[i], 4);
2811 do_ROUNDSS_1XX(True/*mem*/, &src, &dst);
2812 printf("m (rm=%u) roundss_1XX ", rm);
2813 showV128(&src);
2814 printf(" ");
2815 showV128(&dst);
2816 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2817 printf("\n");
2818 }
2819 }
2820
2821 rm = get_sse_roundingmode();
2822 assert(rm == 3);
2823 set_sse_roundingmode(0);
2824 rm = get_sse_roundingmode();
2825 assert(rm == 0); // 0 == RN == default
2826 }
2827
2828 /* ------------ ROUNDPD ------------ */
2829
/* Execute ROUNDPD with imm8 = 0 (rounding taken from the immediate:
   round to nearest).

   mem != 0: the source operand is read directly from memory at src.
             NOTE(review): the non-VEX memory form of a 128-bit packed
             instruction normally requires a 16-byte-aligned address --
             confirm callers pass an aligned src on this path.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundpd $0, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundpd $0, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2853
/* Execute ROUNDPD with imm8 = 1 (rounding taken from the immediate:
   round down, toward -infinity).

   mem != 0: the source operand is read directly from memory at src.
             NOTE(review): the non-VEX memory form of a 128-bit packed
             instruction normally requires a 16-byte-aligned address --
             confirm callers pass an aligned src on this path.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundpd $1, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundpd $1, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2877
/* Execute ROUNDPD with imm8 = 2 (rounding taken from the immediate:
   round up, toward +infinity).

   mem != 0: the source operand is read directly from memory at src.
             NOTE(review): the non-VEX memory form of a 128-bit packed
             instruction normally requires a 16-byte-aligned address --
             confirm callers pass an aligned src on this path.
   mem == 0: *src is first loaded into %xmm2 and the register form is used.
   In both cases %xmm11 is preloaded from *dst, used as the destination,
   and all 16 bytes are written back to *dst.

   The "memory" clobber is required: the asm reads *src and rewrites *dst
   through plain pointer operands, which the compiler cannot otherwise see
   and might cache or reorder around. */
void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "roundpd $2, (%0), %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd (%1), %%xmm11" "\n\t"
         "movupd (%0), %%xmm2" "\n\t"
         "roundpd $2, %%xmm2, %%xmm11" "\n\t"
         "movupd %%xmm11, (%1)" "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11","xmm2", "memory"
      );
   }
}
2901
/* Runs ROUNDPD with immediate $3 on the 16 bytes at src and stores the
   result in the 16 bytes at dst.  Per the Intel encoding, imm8 = 3 selects
   the rounding mode from the immediate: round toward zero (truncate).

   mem - True:  src is used directly as the instruction's memory operand.
         False: src is first loaded into xmm2 and the register form is used.
   src - input vector; only read.
   dst - loaded into xmm11 before the instruction and overwritten with
         xmm11 afterwards.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"       "\n\t"
         "roundpd $3, (%0), %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"       "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         /* "memory": only the pointers are operands, yet the asm reads
            *src and writes *dst, so the compiler must be told. */
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"         "\n\t"
         "movupd  (%0), %%xmm2"          "\n\t"
         "roundpd $3, %%xmm2, %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"         "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "xmm2", "memory"
      );
   }
}
2925
/* Runs ROUNDPD with immediate $4 on the 16 bytes at src and stores the
   result in the 16 bytes at dst.  Per the Intel encoding, imm8 bit 2 set
   means the rounding mode comes from MXCSR.RC rather than from the
   immediate, so the result depends on the SSE rounding mode in force when
   this is called.

   mem - True:  src is used directly as the instruction's memory operand.
         False: src is first loaded into xmm2 and the register form is used.
   src - input vector; only read.
   dst - loaded into xmm11 before the instruction and overwritten with
         xmm11 afterwards.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"       "\n\t"
         "roundpd $4, (%0), %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"       "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         /* "memory": only the pointers are operands, yet the asm reads
            *src and writes *dst, so the compiler must be told. */
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"         "\n\t"
         "movupd  (%0), %%xmm2"          "\n\t"
         "roundpd $4, %%xmm2, %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"         "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "xmm2", "memory"
      );
   }
}
2949
test_ROUNDPD_w_immediate_rounding(void)2950 void test_ROUNDPD_w_immediate_rounding ( void )
2951 {
2952 double vals[22];
2953 Int i = 0;
2954 vals[i++] = 0.0;
2955 vals[i++] = -0.0;
2956 vals[i++] = mkPosInf();
2957 vals[i++] = mkNegInf();
2958 vals[i++] = mkPosNan();
2959 vals[i++] = mkNegNan();
2960 vals[i++] = -1.3;
2961 vals[i++] = -1.1;
2962 vals[i++] = -0.9;
2963 vals[i++] = -0.7;
2964 vals[i++] = -0.50001;
2965 vals[i++] = -0.49999;
2966 vals[i++] = -0.3;
2967 vals[i++] = -0.1;
2968 vals[i++] = 0.1;
2969 vals[i++] = 0.3;
2970 vals[i++] = 0.49999;
2971 vals[i++] = 0.50001;
2972 vals[i++] = 0.7;
2973 vals[i++] = 0.9;
2974 vals[i++] = 1.1;
2975 vals[i++] = 1.3;
2976 assert(i == 22);
2977
2978 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2979 V128 src, dst;
2980
2981 randV128(&src);
2982 randV128(&dst);
2983 memcpy(&src[0], &vals[i], 8);
2984 memcpy(&src[8], &vals[(i+11)%22], 8);
2985 do_ROUNDPD_000(False/*reg*/, &src, &dst);
2986 printf("r roundpd_000 ");
2987 showV128(&src);
2988 printf(" ");
2989 showV128(&dst);
2990 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
2991 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
2992 printf("\n");
2993
2994 randV128(&src);
2995 randV128(&dst);
2996 memcpy(&src[0], &vals[i], 8);
2997 memcpy(&src[8], &vals[(i+11)%22], 8);
2998 do_ROUNDPD_000(True/*mem*/, &src, &dst);
2999 printf("m roundpd_000 ");
3000 showV128(&src);
3001 printf(" ");
3002 showV128(&dst);
3003 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3004 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3005 printf("\n");
3006
3007
3008 randV128(&src);
3009 randV128(&dst);
3010 memcpy(&src[0], &vals[i], 8);
3011 memcpy(&src[8], &vals[(i+11)%22], 8);
3012 do_ROUNDPD_001(False/*reg*/, &src, &dst);
3013 printf("r roundpd_001 ");
3014 showV128(&src);
3015 printf(" ");
3016 showV128(&dst);
3017 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3018 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3019 printf("\n");
3020
3021 randV128(&src);
3022 randV128(&dst);
3023 memcpy(&src[0], &vals[i], 8);
3024 memcpy(&src[8], &vals[(i+11)%22], 8);
3025 do_ROUNDPD_001(True/*mem*/, &src, &dst);
3026 printf("m roundpd_001 ");
3027 showV128(&src);
3028 printf(" ");
3029 showV128(&dst);
3030 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3031 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3032 printf("\n");
3033
3034
3035 randV128(&src);
3036 randV128(&dst);
3037 memcpy(&src[0], &vals[i], 8);
3038 memcpy(&src[8], &vals[(i+11)%22], 8);
3039 do_ROUNDPD_010(False/*reg*/, &src, &dst);
3040 printf("r roundpd_010 ");
3041 showV128(&src);
3042 printf(" ");
3043 showV128(&dst);
3044 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3045 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3046 printf("\n");
3047
3048 randV128(&src);
3049 randV128(&dst);
3050 memcpy(&src[0], &vals[i], 8);
3051 memcpy(&src[8], &vals[(i+11)%22], 8);
3052 do_ROUNDPD_010(True/*mem*/, &src, &dst);
3053 printf("m roundpd_010 ");
3054 showV128(&src);
3055 printf(" ");
3056 showV128(&dst);
3057 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3058 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3059 printf("\n");
3060
3061
3062 randV128(&src);
3063 randV128(&dst);
3064 memcpy(&src[0], &vals[i], 8);
3065 memcpy(&src[8], &vals[(i+11)%22], 8);
3066 do_ROUNDPD_011(False/*reg*/, &src, &dst);
3067 printf("r roundpd_011 ");
3068 showV128(&src);
3069 printf(" ");
3070 showV128(&dst);
3071 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3072 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3073 printf("\n");
3074
3075 randV128(&src);
3076 randV128(&dst);
3077 memcpy(&src[0], &vals[i], 8);
3078 memcpy(&src[8], &vals[(i+11)%22], 8);
3079 do_ROUNDPD_011(True/*mem*/, &src, &dst);
3080 printf("m roundpd_011 ");
3081 showV128(&src);
3082 printf(" ");
3083 showV128(&dst);
3084 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3085 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3086 printf("\n");
3087 }
3088 }
3089
test_ROUNDPD_w_mxcsr_rounding(void)3090 void test_ROUNDPD_w_mxcsr_rounding ( void )
3091 {
3092 UInt rm;
3093 double vals[22];
3094 Int i = 0;
3095 vals[i++] = 0.0;
3096 vals[i++] = -0.0;
3097 vals[i++] = mkPosInf();
3098 vals[i++] = mkNegInf();
3099 vals[i++] = mkPosNan();
3100 vals[i++] = mkNegNan();
3101 vals[i++] = -1.3;
3102 vals[i++] = -1.1;
3103 vals[i++] = -0.9;
3104 vals[i++] = -0.7;
3105 vals[i++] = -0.50001;
3106 vals[i++] = -0.49999;
3107 vals[i++] = -0.3;
3108 vals[i++] = -0.1;
3109 vals[i++] = 0.1;
3110 vals[i++] = 0.3;
3111 vals[i++] = 0.49999;
3112 vals[i++] = 0.50001;
3113 vals[i++] = 0.7;
3114 vals[i++] = 0.9;
3115 vals[i++] = 1.1;
3116 vals[i++] = 1.3;
3117 assert(i == 22);
3118
3119 rm = get_sse_roundingmode();
3120 assert(rm == 0); // 0 == RN == default
3121
3122 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3123 V128 src, dst;
3124
3125 for (rm = 0; rm <= 3; rm++) {
3126 set_sse_roundingmode(rm);
3127
3128 randV128(&src);
3129 randV128(&dst);
3130 memcpy(&src[0], &vals[i], 8);
3131 memcpy(&src[8], &vals[(i+11)%22], 8);
3132 do_ROUNDPD_1XX(False/*reg*/, &src, &dst);
3133 printf("r (rm=%u) roundpd_1XX ", rm);
3134 showV128(&src);
3135 printf(" ");
3136 showV128(&dst);
3137 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3138 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3139 printf("\n");
3140
3141 randV128(&src);
3142 randV128(&dst);
3143 memcpy(&src[0], &vals[i], 8);
3144 memcpy(&src[8], &vals[(i+11)%22], 8);
3145 do_ROUNDPD_1XX(True/*mem*/, &src, &dst);
3146 printf("m (rm=%u) roundpd_1XX ", rm);
3147 showV128(&src);
3148 printf(" ");
3149 showV128(&dst);
3150 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3151 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3152 printf("\n");
3153 }
3154 }
3155
3156 rm = get_sse_roundingmode();
3157 assert(rm == 3);
3158 set_sse_roundingmode(0);
3159 rm = get_sse_roundingmode();
3160 assert(rm == 0); // 0 == RN == default
3161 }
3162
3163 /* ------------ ROUNDPS ------------ */
3164
/* Runs ROUNDPS with immediate $0 on the 16 bytes at src and stores the
   result in the 16 bytes at dst.  Per the Intel encoding, imm8 = 0 selects
   the rounding mode from the immediate: round to nearest.

   mem - True:  src is used directly as the instruction's memory operand.
         False: src is first loaded into xmm2 and the register form is used.
   src - input vector; only read.
   dst - loaded into xmm11 before the instruction and overwritten with
         xmm11 afterwards.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"       "\n\t"
         "roundps $0, (%0), %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"       "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         /* "memory": only the pointers are operands, yet the asm reads
            *src and writes *dst, so the compiler must be told. */
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"         "\n\t"
         "movupd  (%0), %%xmm2"          "\n\t"
         "roundps $0, %%xmm2, %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"         "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "xmm2", "memory"
      );
   }
}
3188
/* Runs ROUNDPS with immediate $1 on the 16 bytes at src and stores the
   result in the 16 bytes at dst.  Per the Intel encoding, imm8 = 1 selects
   the rounding mode from the immediate: round down (toward -infinity).

   mem - True:  src is used directly as the instruction's memory operand.
         False: src is first loaded into xmm2 and the register form is used.
   src - input vector; only read.
   dst - loaded into xmm11 before the instruction and overwritten with
         xmm11 afterwards.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"       "\n\t"
         "roundps $1, (%0), %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"       "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         /* "memory": only the pointers are operands, yet the asm reads
            *src and writes *dst, so the compiler must be told. */
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"         "\n\t"
         "movupd  (%0), %%xmm2"          "\n\t"
         "roundps $1, %%xmm2, %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"         "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "xmm2", "memory"
      );
   }
}
3212
/* Runs ROUNDPS with immediate $2 on the 16 bytes at src and stores the
   result in the 16 bytes at dst.  Per the Intel encoding, imm8 = 2 selects
   the rounding mode from the immediate: round up (toward +infinity).

   mem - True:  src is used directly as the instruction's memory operand.
         False: src is first loaded into xmm2 and the register form is used.
   src - input vector; only read.
   dst - loaded into xmm11 before the instruction and overwritten with
         xmm11 afterwards.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"       "\n\t"
         "roundps $2, (%0), %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"       "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         /* "memory": only the pointers are operands, yet the asm reads
            *src and writes *dst, so the compiler must be told. */
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"         "\n\t"
         "movupd  (%0), %%xmm2"          "\n\t"
         "roundps $2, %%xmm2, %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"         "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "xmm2", "memory"
      );
   }
}
3236
/* Runs ROUNDPS with immediate $3 on the 16 bytes at src and stores the
   result in the 16 bytes at dst.  Per the Intel encoding, imm8 = 3 selects
   the rounding mode from the immediate: round toward zero (truncate).

   mem - True:  src is used directly as the instruction's memory operand.
         False: src is first loaded into xmm2 and the register form is used.
   src - input vector; only read.
   dst - loaded into xmm11 before the instruction and overwritten with
         xmm11 afterwards.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"       "\n\t"
         "roundps $3, (%0), %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"       "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         /* "memory": only the pointers are operands, yet the asm reads
            *src and writes *dst, so the compiler must be told. */
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"         "\n\t"
         "movupd  (%0), %%xmm2"          "\n\t"
         "roundps $3, %%xmm2, %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"         "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "xmm2", "memory"
      );
   }
}
3260
/* Runs ROUNDPS with immediate $4 on the 16 bytes at src and stores the
   result in the 16 bytes at dst.  Per the Intel encoding, imm8 bit 2 set
   means the rounding mode comes from MXCSR.RC rather than from the
   immediate, so the result depends on the SSE rounding mode in force when
   this is called.

   mem - True:  src is used directly as the instruction's memory operand.
         False: src is first loaded into xmm2 and the register form is used.
   src - input vector; only read.
   dst - loaded into xmm11 before the instruction and overwritten with
         xmm11 afterwards.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"       "\n\t"
         "roundps $4, (%0), %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"       "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         /* "memory": only the pointers are operands, yet the asm reads
            *src and writes *dst, so the compiler must be told. */
         : /*TRASH*/ "xmm11", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd  (%1), %%xmm11"         "\n\t"
         "movupd  (%0), %%xmm2"          "\n\t"
         "roundps $4, %%xmm2, %%xmm11"   "\n\t"
         "movupd  %%xmm11, (%1)"         "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst)
         : /*TRASH*/ "xmm11", "xmm2", "memory"
      );
   }
}
3284
test_ROUNDPS_w_immediate_rounding(void)3285 void test_ROUNDPS_w_immediate_rounding ( void )
3286 {
3287 float vals[22];
3288 Int i = 0;
3289 vals[i++] = 0.0;
3290 vals[i++] = -0.0;
3291 vals[i++] = mkPosInf();
3292 vals[i++] = mkNegInf();
3293 vals[i++] = mkPosNan();
3294 vals[i++] = mkNegNan();
3295 vals[i++] = -1.3;
3296 vals[i++] = -1.1;
3297 vals[i++] = -0.9;
3298 vals[i++] = -0.7;
3299 vals[i++] = -0.50001;
3300 vals[i++] = -0.49999;
3301 vals[i++] = -0.3;
3302 vals[i++] = -0.1;
3303 vals[i++] = 0.1;
3304 vals[i++] = 0.3;
3305 vals[i++] = 0.49999;
3306 vals[i++] = 0.50001;
3307 vals[i++] = 0.7;
3308 vals[i++] = 0.9;
3309 vals[i++] = 1.1;
3310 vals[i++] = 1.3;
3311 assert(i == 22);
3312
3313 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3314 V128 src, dst;
3315
3316 randV128(&src);
3317 randV128(&dst);
3318 memcpy(&src[0], &vals[i], 4);
3319 memcpy(&src[4], &vals[(i+5)%22], 4);
3320 memcpy(&src[8], &vals[(i+11)%22], 4);
3321 memcpy(&src[12], &vals[(i+17)%22], 4);
3322 do_ROUNDPS_000(False/*reg*/, &src, &dst);
3323 printf("r roundps_000 ");
3324 showV128(&src);
3325 printf(" ");
3326 showV128(&dst);
3327 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3328 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3329 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3330 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3331 printf("\n");
3332
3333 randV128(&src);
3334 randV128(&dst);
3335 memcpy(&src[0], &vals[i], 4);
3336 memcpy(&src[4], &vals[(i+5)%22], 4);
3337 memcpy(&src[8], &vals[(i+11)%22], 4);
3338 memcpy(&src[12], &vals[(i+17)%22], 4);
3339 do_ROUNDPS_000(True/*mem*/, &src, &dst);
3340 printf("m roundps_000 ");
3341 showV128(&src);
3342 printf(" ");
3343 showV128(&dst);
3344 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3345 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3346 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3347 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3348 printf("\n");
3349
3350
3351 randV128(&src);
3352 randV128(&dst);
3353 memcpy(&src[0], &vals[i], 4);
3354 memcpy(&src[4], &vals[(i+5)%22], 4);
3355 memcpy(&src[8], &vals[(i+11)%22], 4);
3356 memcpy(&src[12], &vals[(i+17)%22], 4);
3357 do_ROUNDPS_001(False/*reg*/, &src, &dst);
3358 printf("r roundps_001 ");
3359 showV128(&src);
3360 printf(" ");
3361 showV128(&dst);
3362 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3363 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3364 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3365 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3366 printf("\n");
3367
3368 randV128(&src);
3369 randV128(&dst);
3370 memcpy(&src[0], &vals[i], 4);
3371 memcpy(&src[4], &vals[(i+5)%22], 4);
3372 memcpy(&src[8], &vals[(i+11)%22], 4);
3373 memcpy(&src[12], &vals[(i+17)%22], 4);
3374 do_ROUNDPS_001(True/*mem*/, &src, &dst);
3375 printf("m roundps_001 ");
3376 showV128(&src);
3377 printf(" ");
3378 showV128(&dst);
3379 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3380 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3381 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3382 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3383 printf("\n");
3384
3385
3386 randV128(&src);
3387 randV128(&dst);
3388 memcpy(&src[0], &vals[i], 4);
3389 memcpy(&src[4], &vals[(i+5)%22], 4);
3390 memcpy(&src[8], &vals[(i+11)%22], 4);
3391 memcpy(&src[12], &vals[(i+17)%22], 4);
3392 do_ROUNDPS_010(False/*reg*/, &src, &dst);
3393 printf("r roundps_010 ");
3394 showV128(&src);
3395 printf(" ");
3396 showV128(&dst);
3397 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3398 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3399 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3400 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3401 printf("\n");
3402
3403 randV128(&src);
3404 randV128(&dst);
3405 memcpy(&src[0], &vals[i], 4);
3406 memcpy(&src[4], &vals[(i+5)%22], 4);
3407 memcpy(&src[8], &vals[(i+11)%22], 4);
3408 memcpy(&src[12], &vals[(i+17)%22], 4);
3409 do_ROUNDPS_010(True/*mem*/, &src, &dst);
3410 printf("m roundps_010 ");
3411 showV128(&src);
3412 printf(" ");
3413 showV128(&dst);
3414 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3415 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3416 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3417 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3418 printf("\n");
3419
3420
3421 randV128(&src);
3422 randV128(&dst);
3423 memcpy(&src[0], &vals[i], 4);
3424 memcpy(&src[4], &vals[(i+5)%22], 4);
3425 memcpy(&src[8], &vals[(i+11)%22], 4);
3426 memcpy(&src[12], &vals[(i+17)%22], 4);
3427 do_ROUNDPS_011(False/*reg*/, &src, &dst);
3428 printf("r roundps_011 ");
3429 showV128(&src);
3430 printf(" ");
3431 showV128(&dst);
3432 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3433 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3434 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3435 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3436 printf("\n");
3437
3438 randV128(&src);
3439 randV128(&dst);
3440 memcpy(&src[0], &vals[i], 4);
3441 memcpy(&src[4], &vals[(i+5)%22], 4);
3442 memcpy(&src[8], &vals[(i+11)%22], 4);
3443 memcpy(&src[12], &vals[(i+17)%22], 4);
3444 do_ROUNDPS_011(True/*mem*/, &src, &dst);
3445 printf("m roundps_011 ");
3446 showV128(&src);
3447 printf(" ");
3448 showV128(&dst);
3449 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3450 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3451 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3452 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3453 printf("\n");
3454 }
3455 }
3456
test_ROUNDPS_w_mxcsr_rounding(void)3457 void test_ROUNDPS_w_mxcsr_rounding ( void )
3458 {
3459 UInt rm;
3460 float vals[22];
3461 Int i = 0;
3462 vals[i++] = 0.0;
3463 vals[i++] = -0.0;
3464 vals[i++] = mkPosInf();
3465 vals[i++] = mkNegInf();
3466 vals[i++] = mkPosNan();
3467 vals[i++] = mkNegNan();
3468 vals[i++] = -1.3;
3469 vals[i++] = -1.1;
3470 vals[i++] = -0.9;
3471 vals[i++] = -0.7;
3472 vals[i++] = -0.50001;
3473 vals[i++] = -0.49999;
3474 vals[i++] = -0.3;
3475 vals[i++] = -0.1;
3476 vals[i++] = 0.1;
3477 vals[i++] = 0.3;
3478 vals[i++] = 0.49999;
3479 vals[i++] = 0.50001;
3480 vals[i++] = 0.7;
3481 vals[i++] = 0.9;
3482 vals[i++] = 1.1;
3483 vals[i++] = 1.3;
3484 assert(i == 22);
3485
3486 rm = get_sse_roundingmode();
3487 assert(rm == 0); // 0 == RN == default
3488
3489 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3490 V128 src, dst;
3491
3492 for (rm = 0; rm <= 3; rm++) {
3493 set_sse_roundingmode(rm);
3494
3495 randV128(&src);
3496 randV128(&dst);
3497 memcpy(&src[0], &vals[i], 4);
3498 memcpy(&src[4], &vals[(i+5)%22], 4);
3499 memcpy(&src[8], &vals[(i+11)%22], 4);
3500 memcpy(&src[12], &vals[(i+17)%22], 4);
3501 do_ROUNDPS_1XX(False/*reg*/, &src, &dst);
3502 printf("r (rm=%u) roundps_1XX ", rm);
3503 showV128(&src);
3504 printf(" ");
3505 showV128(&dst);
3506 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3507 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3508 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3509 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3510 printf("\n");
3511
3512 randV128(&src);
3513 randV128(&dst);
3514 memcpy(&src[0], &vals[i], 4);
3515 memcpy(&src[4], &vals[(i+5)%22], 4);
3516 memcpy(&src[8], &vals[(i+11)%22], 4);
3517 memcpy(&src[12], &vals[(i+17)%22], 4);
3518 do_ROUNDPS_1XX(True/*mem*/, &src, &dst);
3519 printf("m (rm=%u) roundps_1XX ", rm);
3520 showV128(&src);
3521 printf(" ");
3522 showV128(&dst);
3523 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3524 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3525 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3526 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3527 printf("\n");
3528 }
3529 }
3530
3531 rm = get_sse_roundingmode();
3532 assert(rm == 3);
3533 set_sse_roundingmode(0);
3534 rm = get_sse_roundingmode();
3535 assert(rm == 0); // 0 == RN == default
3536 }
3537
3538 /* ------------ PTEST ------------ */
3539
test_PTEST(void)3540 void test_PTEST ( void )
3541 {
3542 const Int ntests = 8;
3543 V128 spec[ntests];
3544 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL );
3545 do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL );
3546 do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL );
3547 do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL );
3548 do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL );
3549 do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL );
3550 do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL );
3551 do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL );
3552 V128 block[2];
3553 Int i, j;
3554 ULong flags;
3555 for (i = 0; i < ntests; i++) {
3556 for (j = 0; j < ntests; j++) {
3557 memcpy(&block[0], &spec[i], 16);
3558 memcpy(&block[1], &spec[j], 16);
3559 __asm__ __volatile__(
3560 "subq $256, %%rsp" "\n\t"
3561 "movupd 0(%1), %%xmm2" "\n\t"
3562 "ptest 16(%1), %%xmm2" "\n\t"
3563 "pushfq" "\n\t"
3564 "popq %0" "\n\t"
3565 "addq $256, %%rsp" "\n\t"
3566 : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) :
3567 "xmm2", "memory", "cc"
3568 );
3569 printf("r ptest ");
3570 showV128(&block[0]);
3571 printf(" ");
3572 showV128(&block[1]);
3573 printf(" -> eflags %04x\n", (UInt)flags & 0x8D5);
3574 }
3575 }
3576 }
3577
3578 /* ------------ PBLENDVB ------------ */
3579
/* Runs PBLENDVB with the implicit xmm0 selector loaded from *xmm0, the
   source taken from *src and the destination taken from / written back to
   *dst.

   mem  - True:  src is used directly as the instruction's memory operand.
          False: src is first loaded into xmm2 and the register form is used.
   xmm0 - 16 bytes loaded into register xmm0 (the blend selector).
   src  - source vector; only read.
   dst  - loaded into xmm11, blended in place, then stored back.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd   (%2), %%xmm0"      "\n\t"
         "movupd   (%1), %%xmm11"     "\n\t"
         "pblendvb (%0), %%xmm11"     "\n\t"
         "movupd   %%xmm11, (%1)"     "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         /* "memory": only the pointers are operands, yet the asm reads
            *xmm0 and *src and rewrites *dst, so the compiler must be
            told. */
         : /*TRASH*/ "xmm11", "xmm0", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd   (%2), %%xmm0"      "\n\t"
         "movupd   (%1), %%xmm11"     "\n\t"
         "movupd   (%0), %%xmm2"      "\n\t"
         "pblendvb %%xmm2, %%xmm11"   "\n\t"
         "movupd   %%xmm11, (%1)"     "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         : /*TRASH*/ "xmm11", "xmm2", "xmm0", "memory"
      );
   }
}
3605
test_PBLENDVB(void)3606 void test_PBLENDVB ( void )
3607 {
3608 V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3609 Int i;
3610 for (i = 0; i < 10; i++) {
3611 randV128(&t_xmm0);
3612 randV128(&t_src);
3613 randV128(&t_dst);
3614
3615 memcpy(&xmm0, &t_xmm0, 16);
3616 memcpy(&src, &t_src, 16);
3617 memcpy(&dst, &t_dst, 16);
3618 do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst);
3619 printf("r pblendvb ");
3620 showV128(&t_xmm0);
3621 printf(" ");
3622 showV128(&t_src);
3623 printf(" ");
3624 showV128(&t_dst);
3625 printf(" -> ");
3626 showV128(&dst);
3627 printf("\n");
3628
3629 memcpy(&xmm0, &t_xmm0, 16);
3630 memcpy(&src, &t_src, 16);
3631 memcpy(&dst, &t_dst, 16);
3632 do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst);
3633 printf("m pblendvb ");
3634 showV128(&t_xmm0);
3635 printf(" ");
3636 showV128(&t_src);
3637 printf(" ");
3638 showV128(&t_dst);
3639 printf(" -> ");
3640 showV128(&dst);
3641 printf("\n");
3642 }
3643 }
3644
3645 /* ------------ BLENDVPD ------------ */
3646
/* Runs BLENDVPD with the implicit xmm0 selector loaded from *xmm0, the
   source taken from *src and the destination taken from / written back to
   *dst.

   mem  - True:  src is used directly as the instruction's memory operand.
          False: src is first loaded into xmm2 and the register form is used.
   xmm0 - 16 bytes loaded into register xmm0 (the blend selector).
   src  - source vector; only read.
   dst  - loaded into xmm11, blended in place, then stored back.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd   (%2), %%xmm0"      "\n\t"
         "movupd   (%1), %%xmm11"     "\n\t"
         "blendvpd (%0), %%xmm11"     "\n\t"
         "movupd   %%xmm11, (%1)"     "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         /* "memory": only the pointers are operands, yet the asm reads
            *xmm0 and *src and rewrites *dst, so the compiler must be
            told. */
         : /*TRASH*/ "xmm11", "xmm0", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd   (%2), %%xmm0"      "\n\t"
         "movupd   (%1), %%xmm11"     "\n\t"
         "movupd   (%0), %%xmm2"      "\n\t"
         "blendvpd %%xmm2, %%xmm11"   "\n\t"
         "movupd   %%xmm11, (%1)"     "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         : /*TRASH*/ "xmm11", "xmm2", "xmm0", "memory"
      );
   }
}
3672
test_BLENDVPD(void)3673 void test_BLENDVPD ( void )
3674 {
3675 V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3676 Int i;
3677 for (i = 0; i < 10; i++) {
3678 randV128(&t_xmm0);
3679 randV128(&t_src);
3680 randV128(&t_dst);
3681
3682 memcpy(&xmm0, &t_xmm0, 16);
3683 memcpy(&src, &t_src, 16);
3684 memcpy(&dst, &t_dst, 16);
3685 do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst);
3686 printf("r blendvpd ");
3687 showV128(&t_xmm0);
3688 printf(" ");
3689 showV128(&t_src);
3690 printf(" ");
3691 showV128(&t_dst);
3692 printf(" -> ");
3693 showV128(&dst);
3694 printf("\n");
3695
3696 memcpy(&xmm0, &t_xmm0, 16);
3697 memcpy(&src, &t_src, 16);
3698 memcpy(&dst, &t_dst, 16);
3699 do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst);
3700 printf("m blendvpd ");
3701 showV128(&t_xmm0);
3702 printf(" ");
3703 showV128(&t_src);
3704 printf(" ");
3705 showV128(&t_dst);
3706 printf(" -> ");
3707 showV128(&dst);
3708 printf("\n");
3709 }
3710 }
3711
3712 /* ------------ BLENDVPS ------------ */
3713
/* Runs BLENDVPS with the implicit xmm0 selector loaded from *xmm0, the
   source taken from *src and the destination taken from / written back to
   *dst.

   mem  - True:  src is used directly as the instruction's memory operand.
          False: src is first loaded into xmm2 and the register form is used.
   xmm0 - 16 bytes loaded into register xmm0 (the blend selector).
   src  - source vector; only read.
   dst  - loaded into xmm11, blended in place, then stored back.

   NOTE(review): the memory form addresses (%0) directly; the legacy SSE
   encoding is expected to require a 16-byte-aligned operand -- confirm
   that callers only pass suitably aligned V128s. */
void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
{
   if (mem) {
      __asm__ __volatile__(
         "movupd   (%2), %%xmm0"      "\n\t"
         "movupd   (%1), %%xmm11"     "\n\t"
         "blendvps (%0), %%xmm11"     "\n\t"
         "movupd   %%xmm11, (%1)"     "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         /* "memory": only the pointers are operands, yet the asm reads
            *xmm0 and *src and rewrites *dst, so the compiler must be
            told. */
         : /*TRASH*/ "xmm11", "xmm0", "memory"
      );
   } else {
      __asm__ __volatile__(
         "movupd   (%2), %%xmm0"      "\n\t"
         "movupd   (%1), %%xmm11"     "\n\t"
         "movupd   (%0), %%xmm2"      "\n\t"
         "blendvps %%xmm2, %%xmm11"   "\n\t"
         "movupd   %%xmm11, (%1)"     "\n"
         : /*OUT*/
         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
         : /*TRASH*/ "xmm11", "xmm2", "xmm0", "memory"
      );
   }
}
3739
test_BLENDVPS(void)3740 void test_BLENDVPS ( void )
3741 {
3742 V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3743 Int i;
3744 for (i = 0; i < 10; i++) {
3745 randV128(&t_xmm0);
3746 randV128(&t_src);
3747 randV128(&t_dst);
3748
3749 memcpy(&xmm0, &t_xmm0, 16);
3750 memcpy(&src, &t_src, 16);
3751 memcpy(&dst, &t_dst, 16);
3752 do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst);
3753 printf("r blendvps ");
3754 showV128(&t_xmm0);
3755 printf(" ");
3756 showV128(&t_src);
3757 printf(" ");
3758 showV128(&t_dst);
3759 printf(" -> ");
3760 showV128(&dst);
3761 printf("\n");
3762
3763 memcpy(&xmm0, &t_xmm0, 16);
3764 memcpy(&src, &t_src, 16);
3765 memcpy(&dst, &t_dst, 16);
3766 do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst);
3767 printf("m blendvps ");
3768 showV128(&t_xmm0);
3769 printf(" ");
3770 showV128(&t_src);
3771 printf(" ");
3772 showV128(&t_dst);
3773 printf(" -> ");
3774 showV128(&dst);
3775 printf("\n");
3776 }
3777 }
3778
3779 /* ------------ main ------------ */
3780
main(int argc,char ** argv)3781 int main ( int argc, char** argv )
3782 {
3783 #if 1
3784 // ------ SSE 4.1 ------
3785 test_BLENDPD(); // done Apr.01.2010
3786 test_BLENDPS(); // done Apr.02.2010
3787 test_PBLENDW();
3788 test_PBLENDVB();
3789 test_BLENDVPD();
3790 test_BLENDVPS();
3791 test_DPPD(); // done Apr.08.2010
3792 test_DPPS(); // done Apr.09.2010
3793 test_EXTRACTPS();
3794 test_INSERTPS(); // done Apr.01.2010
3795 // MOVNTDQA ***
3796 test_PCMPEQQ();
3797 test_PEXTRB(); // done Apr.15.2010
3798 test_PEXTRD(); // done Apr.14.2010
3799 test_PEXTRQ(); // done Apr.14.2010
3800 test_PEXTRW(); // done Apr.14.2010
3801 test_PINSRQ(); // done Apr.16.2010
3802 test_PINSRD(); // todo
3803 test_PINSRW(); /* Umm, this is SSE2, not SSE4. Right? */
3804 test_PINSRB(); // todo
3805 test_PMAXSB();
3806 test_PMAXSD(); // done Apr.09.2010
3807 test_PMAXUD(); // done Apr.16.2010
3808 test_PMAXUW();
3809 test_PMINSB();
3810 test_PMINSD(); // done Apr.09.2010
3811 test_PMINUD();
3812 test_PMINUW();
3813 test_PMOVSXBW(); // done Apr.02.2010
3814 test_PMOVSXBD(); // done Mar.30.2010
3815 test_PMOVSXBQ(); // done Mar.30.2010
3816 test_PMOVSXWD(); // done Mar.31.2010
3817 test_PMOVSXWQ(); // done Mar.31.2010
3818 test_PMOVSXDQ(); // done Mar.31.2010
3819 test_PMOVZXBW(); // done Mar.28.2010
3820 test_PMOVZXBD(); // done Mar.29.2010
3821 test_PMOVZXBQ(); // done Mar.29.2010
3822 test_PMOVZXWD(); // done Mar.28.2010
3823 test_PMOVZXWQ(); // done Mar.29.2010
3824 test_PMOVZXDQ(); // done Mar.29.2010
3825 test_POPCNTW();
3826 test_POPCNTL();
3827 test_POPCNTQ();
3828 test_PMULDQ();
3829 test_PMULLD();
3830 test_PTEST();
3831 test_ROUNDSD_w_immediate_rounding();
3832 test_ROUNDSS_w_immediate_rounding();
3833 test_ROUNDPD_w_immediate_rounding();
3834 test_ROUNDPS_w_immediate_rounding();
3835 test_ROUNDSD_w_mxcsr_rounding();
3836 test_ROUNDSS_w_mxcsr_rounding();
3837 test_ROUNDPD_w_mxcsr_rounding();
3838 test_ROUNDPS_w_mxcsr_rounding();
3839 // ------ SSE 4.2 ------
3840 test_PCMPGTQ();
3841 // CRC32B,Q
3842 test_PACKUSDW();
3843 test_PHMINPOSUW();
3844 test_MPSADBW();
3845 #else
3846 test_MPSADBW();
3847 #endif
3848
3849 return 0;
3850 }
3851
3852