/* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
   pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
   aspect. */
5 
6 #include <string.h>
7 #include <stdio.h>
8 #include <assert.h>
9 
/* Short fixed-purpose integer aliases used throughout this file. */
typedef  unsigned int   UInt;
typedef  signed int     Int;
typedef  unsigned char  UChar;
typedef  signed char    Char;
typedef  unsigned long long int ULong;
typedef  UChar          Bool;
#define False ((Bool)0)
#define True  ((Bool)1)

/* A 128-bit vector value, viewable either as 16 bytes or as four
   UInt words. */
typedef
   union {
      UChar uChar[16];
      UInt  uInt[4];
   }
   V128;

/* Bit positions of the O, S, Z, A, C and P flags as used by this
   file when extracting them from a saved flags word. */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_C   0
#define SHIFT_P   2

/* Single-bit masks corresponding to the SHIFT_x positions above. */
#define MASK_O    (1ULL << SHIFT_O)
#define MASK_S    (1ULL << SHIFT_S)
#define MASK_Z    (1ULL << SHIFT_Z)
#define MASK_A    (1ULL << SHIFT_A)
#define MASK_C    (1ULL << SHIFT_C)
#define MASK_P    (1ULL << SHIFT_P)
40 
41 
/* Count the leading (most-significant) zero bits of a 32-bit value.
   Returns 32 when x is zero.

   Implemented as a binary search using only unsigned shifts, so the
   result does not depend on how the compiler right-shifts negative
   signed values. */
UInt clz32 ( UInt x )
{
   UInt n = 32;
   UInt shift;
   for (shift = 16; shift != 0; shift >>= 1) {
      UInt upper = x >> shift;
      if (upper != 0) {
         /* Something is set in the upper part: discard the lower
            'shift' bits and keep searching in what remains. */
         n -= shift;
         x = upper;
      }
   }
   /* Here x is 0 or 1; a remaining 1 accounts for one more non-zero
      bit position. */
   return n - x;
}
65 
/* Count the trailing (least-significant) zero bits of a 32-bit
   value.  Returns 32 when x is zero. */
UInt ctz32 ( UInt x )
{
   /* (~x) & (x-1) has ones in exactly the positions below the lowest
      set bit of x (all 32 positions when x is zero), so the number
      of trailing zeroes is the number of bits set in it. */
   UInt below = (~x) & (x - 1);
   return 32 - clz32(below);
}
70 
/* Build a 16-byte vector from a 16-character hex summary string.

   Each hex digit is duplicated into both nibbles of one byte (so 'a'
   becomes 0xAA and '0' becomes 0x00).  The last character of the
   string becomes byte 0 of *dst and the first character becomes
   byte 15.

   Asserts if the string is not exactly 16 characters long or holds a
   character that is not a hex digit. */
void expand ( V128* dst, const char* summary )
{
   Int i;
   assert( strlen(summary) == 16 );
   for (i = 0; i < 16; i++) {
      char  c      = summary[15 - i];
      UChar nibble = 0;
      if      (c >= '0' && c <= '9') { nibble = (UChar)(c - '0'); }
      else if (c >= 'A' && c <= 'F') { nibble = (UChar)(c - 'A' + 10); }
      else if (c >= 'a' && c <= 'f') { nibble = (UChar)(c - 'a' + 10); }
      else assert(0);

      assert(nibble < 16);
      dst->uChar[i] = (UChar)((nibble << 4) | nibble);
   }
}
89 
/* Run one test case and print the outcome.

   which  - two-character label for the variant under test
   h_fn   - first implementation to call (the callers in this file
            pass their h_pcmpistri_* function here)
   s_fn   - second implementation to call (the callers in this file
            pass their s_pcmpistri_* function here)
   summL, summR - 16-character hex summaries, expanded into the two
            vector arguments by expand()

   Both functions are called on the same expanded arguments.  The
   printed line shows both results and ends in "!!!!" if they
   differ. */
void try_istri ( char* which,
                 UInt(*h_fn)(V128*,V128*),
                 UInt(*s_fn)(V128*,V128*),
                 char* summL, char* summR )
{
   V128 argL, argR;
   UInt h_res, s_res;

   assert(strlen(which) == 2);
   expand(&argL, summL);
   expand(&argR, summR);
   h_res = h_fn(&argL, &argR);
   s_res = s_fn(&argL, &argR);
   printf("istri %s  %s %s -> %08x %08x %s\n",
          which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
}
104 
zmask_from_V128(V128 * arg)105 UInt zmask_from_V128 ( V128* arg )
106 {
107    UInt i, res = 0;
108    for (i = 0; i < 16; i++) {
109       res |=  ((arg->uChar[i] == 0) ? 1 : 0) << i;
110    }
111    return res;
112 }
113 
//////////////////////////////////////////////////////////
//                                                      //
//                       GENERAL                        //
//                                                      //
//////////////////////////////////////////////////////////
119 
120 
121 /* Given partial results from a pcmpXstrX operation (intRes1,
122    basically), generate an I format (index value for ECX) output, and
123    also the new OSZACP flags.
124 */
125 static
pcmpXstrX_WRK_gen_output_fmt_I(V128 * resV,UInt * resOSZACP,UInt intRes1,UInt zmaskL,UInt zmaskR,UInt validL,UInt pol,UInt idx)126 void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
127                                     /*OUT*/UInt* resOSZACP,
128                                     UInt intRes1,
129                                     UInt zmaskL, UInt zmaskR,
130                                     UInt validL,
131                                     UInt pol, UInt idx )
132 {
133    assert((pol >> 2) == 0);
134    assert((idx >> 1) == 0);
135 
136    UInt intRes2 = 0;
137    switch (pol) {
138       case 0: intRes2 = intRes1;          break; // pol +
139       case 1: intRes2 = ~intRes1;         break; // pol -
140       case 2: intRes2 = intRes1;          break; // pol m+
141       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
142    }
143    intRes2 &= 0xFFFF;
144 
145    // generate ecx value
146    UInt newECX = 0;
147    if (idx) {
148      // index of ms-1-bit
149      newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
150    } else {
151      // index of ls-1-bit
152      newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
153    }
154 
155    *(UInt*)(&resV[0]) = newECX;
156 
157    // generate new flags, common to all ISTRI and ISTRM cases
158    *resOSZACP    // A, P are zero
159      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
160      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
161      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
162      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
163 }
164 
165 
166 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
167    variants.
168 
169    For xSTRI variants, the new ECX value is placed in the 32 bits
170    pointed to by *resV.  For xSTRM variants, the result is a 128 bit
171    value and is placed at *resV in the obvious way.
172 
173    For all variants, the new OSZACP value is placed at *resOSZACP.
174 
175    argLV and argRV are the vector args.  The caller must prepare a
176    16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
177    must be 1 for each zero byte of of the respective arg.  For ESTRx
178    variants this is derived from the explicit length indication, and
179    must be 0 in all places except at the bit index corresponding to
180    the valid length (0 .. 16).  If the valid length is 16 then the
181    mask must be all zeroes.  In all cases, bits 31:16 must be zero.
182 
183    imm8 is the original immediate from the instruction.  isSTRM
184    indicates whether this is a xSTRM or xSTRI variant, which controls
185    how much of *res is written.
186 
187    If the given imm8 case can be handled, the return value is True.
188    If not, False is returned, and neither *res not *resOSZACP are
189    altered.
190 */
191 
pcmpXstrX_WRK(V128 * resV,UInt * resOSZACP,V128 * argLV,V128 * argRV,UInt zmaskL,UInt zmaskR,UInt imm8,Bool isSTRM)192 Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
193                      /*OUT*/UInt* resOSZACP,
194                      V128* argLV,  V128* argRV,
195                      UInt zmaskL, UInt zmaskR,
196                      UInt imm8,   Bool isSTRM )
197 {
198    assert(imm8 < 0x80);
199    assert((zmaskL >> 16) == 0);
200    assert((zmaskR >> 16) == 0);
201 
202    /* Explicitly reject any imm8 values that haven't been validated,
203       even if they would probably work.  Life is too short to have
204       unvalidated cases in the code base. */
205    switch (imm8) {
206       case 0x00: case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x0E:
207       case 0x12: case 0x14: case 0x1A:
208       case 0x30: case 0x34: case 0x38: case 0x3A:
209       case 0x40: case 0x44: case 0x46: case 0x4A:
210          break;
211       default:
212          return False;
213    }
214 
215    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
216    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
217    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
218    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
219 
220    /*----------------------------------------*/
221    /*-- strcmp on byte data                --*/
222    /*----------------------------------------*/
223 
224    if (agg == 2/*equal each, aka strcmp*/
225        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
226        && !isSTRM) {
227       Int    i;
228       UChar* argL = (UChar*)argLV;
229       UChar* argR = (UChar*)argRV;
230       UInt boolResII = 0;
231       for (i = 15; i >= 0; i--) {
232          UChar cL  = argL[i];
233          UChar cR  = argR[i];
234          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
235       }
236       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
237       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
238 
239       // do invalidation, common to all equal-each cases
240       UInt intRes1
241          = (boolResII & validL & validR)  // if both valid, use cmpres
242            | (~ (validL | validR));       // if both invalid, force 1
243                                           // else force 0
244       intRes1 &= 0xFFFF;
245 
246       // generate I-format output
247       pcmpXstrX_WRK_gen_output_fmt_I(
248          resV, resOSZACP,
249          intRes1, zmaskL, zmaskR, validL, pol, idx
250       );
251 
252       return True;
253    }
254 
255    /*----------------------------------------*/
256    /*-- set membership on byte data        --*/
257    /*----------------------------------------*/
258 
259    if (agg == 0/*equal any, aka find chars in a set*/
260        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
261        && !isSTRM) {
262       /* argL: the string,  argR: charset */
263       UInt   si, ci;
264       UChar* argL    = (UChar*)argLV;
265       UChar* argR    = (UChar*)argRV;
266       UInt   boolRes = 0;
267       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
268       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
269 
270       for (si = 0; si < 16; si++) {
271          if ((validL & (1 << si)) == 0)
272             // run off the end of the string.
273             break;
274          UInt m = 0;
275          for (ci = 0; ci < 16; ci++) {
276             if ((validR & (1 << ci)) == 0) break;
277             if (argR[ci] == argL[si]) { m = 1; break; }
278          }
279          boolRes |= (m << si);
280       }
281 
282       // boolRes is "pre-invalidated"
283       UInt intRes1 = boolRes & 0xFFFF;
284 
285       // generate I-format output
286       pcmpXstrX_WRK_gen_output_fmt_I(
287          resV, resOSZACP,
288          intRes1, zmaskL, zmaskR, validL, pol, idx
289       );
290 
291       return True;
292    }
293 
294    /*----------------------------------------*/
295    /*-- substring search on byte data      --*/
296    /*----------------------------------------*/
297 
298    if (agg == 3/*equal ordered, aka substring search*/
299        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
300        && !isSTRM) {
301 
302       /* argL: haystack,  argR: needle */
303       UInt   ni, hi;
304       UChar* argL    = (UChar*)argLV;
305       UChar* argR    = (UChar*)argRV;
306       UInt   boolRes = 0;
307       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
308       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
309       for (hi = 0; hi < 16; hi++) {
310          UInt m = 1;
311          for (ni = 0; ni < 16; ni++) {
312             if ((validR & (1 << ni)) == 0) break;
313             UInt i = ni + hi;
314             if (i >= 16) break;
315             if (argL[i] != argR[ni]) { m = 0; break; }
316          }
317          boolRes |= (m << hi);
318          if ((validL & (1 << hi)) == 0)
319             // run off the end of the haystack
320             break;
321       }
322 
323       // boolRes is "pre-invalidated"
324       UInt intRes1 = boolRes & 0xFFFF;
325 
326       // generate I-format output
327       pcmpXstrX_WRK_gen_output_fmt_I(
328          resV, resOSZACP,
329          intRes1, zmaskL, zmaskR, validL, pol, idx
330       );
331 
332       return True;
333    }
334 
335    /*----------------------------------------*/
336    /*-- ranges, unsigned byte data         --*/
337    /*----------------------------------------*/
338 
339    if (agg == 1/*ranges*/
340        && fmt == 0/*ub*/
341        && !isSTRM) {
342 
343       /* argL: string,  argR: range-pairs */
344       UInt   ri, si;
345       UChar* argL    = (UChar*)argLV;
346       UChar* argR    = (UChar*)argRV;
347       UInt   boolRes = 0;
348       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
349       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
350       for (si = 0; si < 16; si++) {
351          if ((validL & (1 << si)) == 0)
352             // run off the end of the string
353             break;
354          UInt m = 0;
355          for (ri = 0; ri < 16; ri += 2) {
356             if ((validR & (3 << ri)) != (3 << ri)) break;
357             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
358                m = 1; break;
359             }
360          }
361          boolRes |= (m << si);
362       }
363 
364       // boolRes is "pre-invalidated"
365       UInt intRes1 = boolRes & 0xFFFF;
366 
367       // generate I-format output
368       pcmpXstrX_WRK_gen_output_fmt_I(
369          resV, resOSZACP,
370          intRes1, zmaskL, zmaskR, validL, pol, idx
371       );
372 
373       return True;
374    }
375 
376    /*----------------------------------------*/
377    /*-- ranges, signed byte data           --*/
378    /*----------------------------------------*/
379 
380    if (agg == 1/*ranges*/
381        && fmt == 2/*sb*/
382        && !isSTRM) {
383 
384       /* argL: string,  argR: range-pairs */
385       UInt   ri, si;
386       Char*  argL    = (Char*)argLV;
387       Char*  argR    = (Char*)argRV;
388       UInt   boolRes = 0;
389       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
390       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
391       for (si = 0; si < 16; si++) {
392          if ((validL & (1 << si)) == 0)
393             // run off the end of the string
394             break;
395          UInt m = 0;
396          for (ri = 0; ri < 16; ri += 2) {
397             if ((validR & (3 << ri)) != (3 << ri)) break;
398             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
399                m = 1; break;
400             }
401          }
402          boolRes |= (m << si);
403       }
404 
405       // boolRes is "pre-invalidated"
406       UInt intRes1 = boolRes & 0xFFFF;
407 
408       // generate I-format output
409       pcmpXstrX_WRK_gen_output_fmt_I(
410          resV, resOSZACP,
411          intRes1, zmaskL, zmaskR, validL, pol, idx
412       );
413 
414       return True;
415    }
416 
417    return False;
418 }
419 
420 
//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_4A                       //
//                                                      //
//////////////////////////////////////////////////////////
426 
h_pcmpistri_4A(V128 * argL,V128 * argR)427 UInt h_pcmpistri_4A ( V128* argL, V128* argR )
428 {
429    V128 block[2];
430    memcpy(&block[0], argL, sizeof(V128));
431    memcpy(&block[1], argR, sizeof(V128));
432    ULong res, flags;
433    __asm__ __volatile__(
434       "subq      $1024,  %%rsp"             "\n\t"
435       "movdqu    0(%2),  %%xmm2"            "\n\t"
436       "movdqu    16(%2), %%xmm11"           "\n\t"
437       "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
438       "pushfq"                              "\n\t"
439       "popq      %%rdx"                     "\n\t"
440       "movq      %%rcx,  %0"                "\n\t"
441       "movq      %%rdx,  %1"                "\n\t"
442       "addq      $1024,  %%rsp"             "\n\t"
443       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
444       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
445    );
446    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
447 }
448 
s_pcmpistri_4A(V128 * argLU,V128 * argRU)449 UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
450 {
451    V128 resV;
452    UInt resOSZACP, resECX;
453    Bool ok
454       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
455                        zmask_from_V128(argLU),
456                        zmask_from_V128(argRU),
457                        0x4A, False/*!isSTRM*/
458         );
459    assert(ok);
460    resECX = resV.uInt[0];
461    return (resOSZACP << 16) | resECX;
462 }
463 
istri_4A(void)464 void istri_4A ( void )
465 {
466    char* wot = "4A";
467    UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
468    UInt(*s)(V128*,V128*) = s_pcmpistri_4A;
469 
470    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
471 
472    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
473    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
474    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
475    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
476 
477    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
478    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
479    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
480 
481    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
482    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
483    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
484    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
485 
486    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
487    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
488    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
489 
490    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
491 
492    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
493    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
494    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
495 
496    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
497    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
498    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
499 
500    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
501    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
502    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
503 
504    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
505    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
506    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
507 
508    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
509    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
510 }
511 
//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_3A                       //
//                                                      //
//////////////////////////////////////////////////////////
517 
h_pcmpistri_3A(V128 * argL,V128 * argR)518 UInt h_pcmpistri_3A ( V128* argL, V128* argR )
519 {
520    V128 block[2];
521    memcpy(&block[0], argL, sizeof(V128));
522    memcpy(&block[1], argR, sizeof(V128));
523    ULong res, flags;
524    __asm__ __volatile__(
525       "subq      $1024,  %%rsp"             "\n\t"
526       "movdqu    0(%2),  %%xmm2"            "\n\t"
527       "movdqu    16(%2), %%xmm11"           "\n\t"
528       "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
529       "pushfq"                              "\n\t"
530       "popq      %%rdx"                     "\n\t"
531       "movq      %%rcx,  %0"                "\n\t"
532       "movq      %%rdx,  %1"                "\n\t"
533       "addq      $1024,  %%rsp"             "\n\t"
534       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
535       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
536    );
537    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
538 }
539 
s_pcmpistri_3A(V128 * argLU,V128 * argRU)540 UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
541 {
542    V128 resV;
543    UInt resOSZACP, resECX;
544    Bool ok
545       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
546                        zmask_from_V128(argLU),
547                        zmask_from_V128(argRU),
548                        0x3A, False/*!isSTRM*/
549         );
550    assert(ok);
551    resECX = resV.uInt[0];
552    return (resOSZACP << 16) | resECX;
553 }
554 
istri_3A(void)555 void istri_3A ( void )
556 {
557    char* wot = "3A";
558    UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
559    UInt(*s)(V128*,V128*) = s_pcmpistri_3A;
560 
561    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
562 
563    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
564    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
565    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
566    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
567 
568    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
569    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
570    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
571 
572    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
573    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
574    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
575    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
576 
577    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
578    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
579    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
580 
581    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
582 
583    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
584    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
585    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
586 
587    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
588    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
589    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
590 
591    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
592    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
593    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
594 
595    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
596    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
597    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
598 
599    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
600    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
601 }
602 
603 
604 
//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_0C                       //
//                                                      //
//////////////////////////////////////////////////////////
610 
611 __attribute__((noinline))
h_pcmpistri_0C(V128 * argL,V128 * argR)612 UInt h_pcmpistri_0C ( V128* argL, V128* argR )
613 {
614    V128 block[2];
615    memcpy(&block[0], argL, sizeof(V128));
616    memcpy(&block[1], argR, sizeof(V128));
617    ULong res = 0, flags = 0;
618    __asm__ __volatile__(
619       "movdqu    0(%2),  %%xmm2"            "\n\t"
620       "movdqu    16(%2), %%xmm11"           "\n\t"
621       "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
622       //"pcmpistrm $0x0C,  %%xmm2, %%xmm11"   "\n\t"
623       //"movd %%xmm0, %%ecx" "\n\t"
624       "pushfq"                              "\n\t"
625       "popq      %%rdx"                     "\n\t"
626       "movq      %%rcx,  %0"                "\n\t"
627       "movq      %%rdx,  %1"                "\n\t"
628       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
629       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
630    );
631    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
632 }
633 
s_pcmpistri_0C(V128 * argLU,V128 * argRU)634 UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
635 {
636    V128 resV;
637    UInt resOSZACP, resECX;
638    Bool ok
639       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
640                        zmask_from_V128(argLU),
641                        zmask_from_V128(argRU),
642                        0x0C, False/*!isSTRM*/
643         );
644    assert(ok);
645    resECX = resV.uInt[0];
646    return (resOSZACP << 16) | resECX;
647 }
648 
istri_0C(void)649 void istri_0C ( void )
650 {
651    char* wot = "0C";
652    UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
653    UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
654 
655    try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
656 
657    try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
658 
659    try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
660    try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
661    try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
662 
663    try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
664 
665    try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
666    try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
667    try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
668    try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
669    try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
670 
671    try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
672    try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
673    try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
674 
675    try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
676    try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
677 
678    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
679    try_istri(wot,h,s, "1111111111111234", "0000000000000001");
680    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
681 
682    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
683    try_istri(wot,h,s, "a111111111111111", "000000000000000a");
684    try_istri(wot,h,s, "b111111111111111", "000000000000000a");
685 
686    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
687    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
688    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
689    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
690 }
691 
692 
//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_08                       //
//                                                      //
//////////////////////////////////////////////////////////
698 
h_pcmpistri_08(V128 * argL,V128 * argR)699 UInt h_pcmpistri_08 ( V128* argL, V128* argR )
700 {
701    V128 block[2];
702    memcpy(&block[0], argL, sizeof(V128));
703    memcpy(&block[1], argR, sizeof(V128));
704    ULong res, flags;
705    __asm__ __volatile__(
706       "subq      $1024,  %%rsp"             "\n\t"
707       "movdqu    0(%2),  %%xmm2"            "\n\t"
708       "movdqu    16(%2), %%xmm11"           "\n\t"
709       "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
710       "pushfq"                              "\n\t"
711       "popq      %%rdx"                     "\n\t"
712       "movq      %%rcx,  %0"                "\n\t"
713       "movq      %%rdx,  %1"                "\n\t"
714       "addq      $1024,  %%rsp"             "\n\t"
715       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
716       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
717    );
718    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
719 }
720 
s_pcmpistri_08(V128 * argLU,V128 * argRU)721 UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
722 {
723    V128 resV;
724    UInt resOSZACP, resECX;
725    Bool ok
726       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
727                        zmask_from_V128(argLU),
728                        zmask_from_V128(argRU),
729                        0x08, False/*!isSTRM*/
730         );
731    assert(ok);
732    resECX = resV.uInt[0];
733    return (resOSZACP << 16) | resECX;
734 }
735 
istri_08(void)736 void istri_08 ( void )
737 {
738    char* wot = "08";
739    UInt(*h)(V128*,V128*) = h_pcmpistri_08;
740    UInt(*s)(V128*,V128*) = s_pcmpistri_08;
741 
742    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
743 
744    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
745    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
746    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
747    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
748 
749    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
750    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
751    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
752 
753    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
754    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
755    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
756    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
757 
758    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
759    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
760    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
761 
762    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
763 
764    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
765    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
766    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
767 
768    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
769    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
770    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
771 
772    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
773    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
774    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
775 
776    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
777    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
778    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
779 
780    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
781    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
782 }
783 
784 
785 
//////////////////////////////////////////////////////////
//                                                      //
//                       ISTRI_1A                       //
//                                                      //
//////////////////////////////////////////////////////////
791 
h_pcmpistri_1A(V128 * argL,V128 * argR)792 UInt h_pcmpistri_1A ( V128* argL, V128* argR )
793 {
794    V128 block[2];
795    memcpy(&block[0], argL, sizeof(V128));
796    memcpy(&block[1], argR, sizeof(V128));
797    ULong res, flags;
798    __asm__ __volatile__(
799       "subq      $1024,  %%rsp"             "\n\t"
800       "movdqu    0(%2),  %%xmm2"            "\n\t"
801       "movdqu    16(%2), %%xmm11"           "\n\t"
802       "pcmpistri $0x1A,  %%xmm2, %%xmm11"   "\n\t"
803       "pushfq"                              "\n\t"
804       "popq      %%rdx"                     "\n\t"
805       "movq      %%rcx,  %0"                "\n\t"
806       "movq      %%rdx,  %1"                "\n\t"
807       "addq      $1024,  %%rsp"             "\n\t"
808       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
809       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
810    );
811    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
812 }
813 
s_pcmpistri_1A(V128 * argLU,V128 * argRU)814 UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
815 {
816    V128 resV;
817    UInt resOSZACP, resECX;
818    Bool ok
819       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
820                        zmask_from_V128(argLU),
821                        zmask_from_V128(argRU),
822                        0x1A, False/*!isSTRM*/
823         );
824    assert(ok);
825    resECX = resV.uInt[0];
826    return (resOSZACP << 16) | resECX;
827 }
828 
istri_1A(void)829 void istri_1A ( void )
830 {
831    char* wot = "1A";
832    UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
833    UInt(*s)(V128*,V128*) = s_pcmpistri_1A;
834 
835    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
836 
837    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
838    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
839    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
840    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
841 
842    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
843    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
844    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
845 
846    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
847    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
848    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
849    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
850 
851    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
852    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
853    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
854 
855    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
856 
857    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
858    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
859    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
860 
861    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
862    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
863    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
864 
865    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
866    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
867    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
868 
869    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
870    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
871    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
872 
873    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
874    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
875 }
876 
877 
878 
879 //////////////////////////////////////////////////////////
880 //                                                      //
881 //                       ISTRI_02                       //
882 //                                                      //
883 //////////////////////////////////////////////////////////
884 
h_pcmpistri_02(V128 * argL,V128 * argR)885 UInt h_pcmpistri_02 ( V128* argL, V128* argR )
886 {
887    V128 block[2];
888    memcpy(&block[0], argL, sizeof(V128));
889    memcpy(&block[1], argR, sizeof(V128));
890    ULong res, flags;
891    __asm__ __volatile__(
892       "subq      $1024,  %%rsp"             "\n\t"
893       "movdqu    0(%2),  %%xmm2"            "\n\t"
894       "movdqu    16(%2), %%xmm11"           "\n\t"
895       "pcmpistri $0x02,  %%xmm2, %%xmm11"   "\n\t"
896 //"pcmpistrm $0x02, %%xmm2, %%xmm11"   "\n\t"
897 //"movd %%xmm0, %%ecx" "\n\t"
898       "pushfq"                              "\n\t"
899       "popq      %%rdx"                     "\n\t"
900       "movq      %%rcx,  %0"                "\n\t"
901       "movq      %%rdx,  %1"                "\n\t"
902       "addq      $1024,  %%rsp"             "\n\t"
903       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
904       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
905    );
906    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
907 }
908 
s_pcmpistri_02(V128 * argLU,V128 * argRU)909 UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
910 {
911    V128 resV;
912    UInt resOSZACP, resECX;
913    Bool ok
914       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
915                        zmask_from_V128(argLU),
916                        zmask_from_V128(argRU),
917                        0x02, False/*!isSTRM*/
918         );
919    assert(ok);
920    resECX = resV.uInt[0];
921    return (resOSZACP << 16) | resECX;
922 }
923 
istri_02(void)924 void istri_02 ( void )
925 {
926    char* wot = "02";
927    UInt(*h)(V128*,V128*) = h_pcmpistri_02;
928    UInt(*s)(V128*,V128*) = s_pcmpistri_02;
929 
930    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
931    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
932    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
933    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
934 
935    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
936    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
937    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
938    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
939    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
940 
941    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
942    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
943    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
944    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
945 
946    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
947    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
948 
949    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
950    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
951    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
952    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
953 
954    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
955 
956    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
957    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
958 }
959 
960 
961 //////////////////////////////////////////////////////////
962 //                                                      //
963 //                       ISTRI_12                       //
964 //                                                      //
965 //////////////////////////////////////////////////////////
966 
h_pcmpistri_12(V128 * argL,V128 * argR)967 UInt h_pcmpistri_12 ( V128* argL, V128* argR )
968 {
969    V128 block[2];
970    memcpy(&block[0], argL, sizeof(V128));
971    memcpy(&block[1], argR, sizeof(V128));
972    ULong res, flags;
973    __asm__ __volatile__(
974       "subq      $1024,  %%rsp"             "\n\t"
975       "movdqu    0(%2),  %%xmm2"            "\n\t"
976       "movdqu    16(%2), %%xmm11"           "\n\t"
977       "pcmpistri $0x12,  %%xmm2, %%xmm11"   "\n\t"
978 //"pcmpistrm $0x12, %%xmm2, %%xmm11"   "\n\t"
979 //"movd %%xmm0, %%ecx" "\n\t"
980       "pushfq"                              "\n\t"
981       "popq      %%rdx"                     "\n\t"
982       "movq      %%rcx,  %0"                "\n\t"
983       "movq      %%rdx,  %1"                "\n\t"
984       "addq      $1024,  %%rsp"             "\n\t"
985       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
986       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
987    );
988    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
989 }
990 
s_pcmpistri_12(V128 * argLU,V128 * argRU)991 UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
992 {
993    V128 resV;
994    UInt resOSZACP, resECX;
995    Bool ok
996       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
997                        zmask_from_V128(argLU),
998                        zmask_from_V128(argRU),
999                        0x12, False/*!isSTRM*/
1000         );
1001    assert(ok);
1002    resECX = resV.uInt[0];
1003    return (resOSZACP << 16) | resECX;
1004 }
1005 
istri_12(void)1006 void istri_12 ( void )
1007 {
1008    char* wot = "12";
1009    UInt(*h)(V128*,V128*) = h_pcmpistri_12;
1010    UInt(*s)(V128*,V128*) = s_pcmpistri_12;
1011 
1012    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
1013    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
1014    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
1015    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
1016 
1017    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1018    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
1019    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
1020    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
1021    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
1022 
1023    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1024    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
1025    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
1026    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
1027 
1028    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1029    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1030 
1031    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
1032    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
1033    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
1034    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
1035 
1036    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
1037 
1038    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
1039    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
1040 }
1041 
1042 
1043 
1044 //////////////////////////////////////////////////////////
1045 //                                                      //
1046 //                       ISTRI_44                       //
1047 //                                                      //
1048 //////////////////////////////////////////////////////////
1049 
h_pcmpistri_44(V128 * argL,V128 * argR)1050 UInt h_pcmpistri_44 ( V128* argL, V128* argR )
1051 {
1052    V128 block[2];
1053    memcpy(&block[0], argL, sizeof(V128));
1054    memcpy(&block[1], argR, sizeof(V128));
1055    ULong res, flags;
1056    __asm__ __volatile__(
1057       "subq      $1024,  %%rsp"             "\n\t"
1058       "movdqu    0(%2),  %%xmm2"            "\n\t"
1059       "movdqu    16(%2), %%xmm11"           "\n\t"
1060       "pcmpistri $0x44,  %%xmm2, %%xmm11"   "\n\t"
1061 //"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
1062 //"movd %%xmm0, %%ecx" "\n\t"
1063       "pushfq"                              "\n\t"
1064       "popq      %%rdx"                     "\n\t"
1065       "movq      %%rcx,  %0"                "\n\t"
1066       "movq      %%rdx,  %1"                "\n\t"
1067       "addq      $1024,  %%rsp"             "\n\t"
1068       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1069       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1070    );
1071    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1072 }
1073 
s_pcmpistri_44(V128 * argLU,V128 * argRU)1074 UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
1075 {
1076    V128 resV;
1077    UInt resOSZACP, resECX;
1078    Bool ok
1079       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1080                        zmask_from_V128(argLU),
1081                        zmask_from_V128(argRU),
1082                        0x44, False/*!isSTRM*/
1083         );
1084    assert(ok);
1085    resECX = resV.uInt[0];
1086    return (resOSZACP << 16) | resECX;
1087 }
1088 
istri_44(void)1089 void istri_44 ( void )
1090 {
1091    char* wot = "44";
1092    UInt(*h)(V128*,V128*) = h_pcmpistri_44;
1093    UInt(*s)(V128*,V128*) = s_pcmpistri_44;
1094 
1095    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
1096    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
1097    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
1098    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
1099 
1100    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1101    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
1102    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
1103    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
1104    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
1105 
1106    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1107 
1108    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1109    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
1110    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
1111 
1112    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
1113    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
1114    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
1115 
1116    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
1117    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
1118 
1119    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
1120    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
1121 
1122    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
1123    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
1124    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
1125    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
1126 }
1127 
1128 
1129 //////////////////////////////////////////////////////////
1130 //                                                      //
1131 //                       ISTRI_00                       //
1132 //                                                      //
1133 //////////////////////////////////////////////////////////
1134 
h_pcmpistri_00(V128 * argL,V128 * argR)1135 UInt h_pcmpistri_00 ( V128* argL, V128* argR )
1136 {
1137    V128 block[2];
1138    memcpy(&block[0], argL, sizeof(V128));
1139    memcpy(&block[1], argR, sizeof(V128));
1140    ULong res, flags;
1141    __asm__ __volatile__(
1142       "subq      $1024,  %%rsp"             "\n\t"
1143       "movdqu    0(%2),  %%xmm2"            "\n\t"
1144       "movdqu    16(%2), %%xmm11"           "\n\t"
1145       "pcmpistri $0x00,  %%xmm2, %%xmm11"   "\n\t"
1146 //"pcmpistrm $0x00, %%xmm2, %%xmm11"   "\n\t"
1147 //"movd %%xmm0, %%ecx" "\n\t"
1148       "pushfq"                              "\n\t"
1149       "popq      %%rdx"                     "\n\t"
1150       "movq      %%rcx,  %0"                "\n\t"
1151       "movq      %%rdx,  %1"                "\n\t"
1152       "addq      $1024,  %%rsp"             "\n\t"
1153       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1154       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1155    );
1156    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1157 }
1158 
s_pcmpistri_00(V128 * argLU,V128 * argRU)1159 UInt s_pcmpistri_00 ( V128* argLU, V128* argRU )
1160 {
1161    V128 resV;
1162    UInt resOSZACP, resECX;
1163    Bool ok
1164       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1165                        zmask_from_V128(argLU),
1166                        zmask_from_V128(argRU),
1167                        0x00, False/*!isSTRM*/
1168         );
1169    assert(ok);
1170    resECX = resV.uInt[0];
1171    return (resOSZACP << 16) | resECX;
1172 }
1173 
istri_00(void)1174 void istri_00 ( void )
1175 {
1176    char* wot = "00";
1177    UInt(*h)(V128*,V128*) = h_pcmpistri_00;
1178    UInt(*s)(V128*,V128*) = s_pcmpistri_00;
1179 
1180    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
1181    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
1182    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
1183    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
1184 
1185    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1186    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
1187    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
1188    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
1189    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
1190 
1191    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1192    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
1193    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
1194    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
1195 
1196    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1197    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1198 
1199    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
1200    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
1201    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
1202    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
1203 
1204    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
1205 
1206    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
1207    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
1208 }
1209 
1210 
1211 //////////////////////////////////////////////////////////
1212 //                                                      //
1213 //                       ISTRI_38                       //
1214 //                                                      //
1215 //////////////////////////////////////////////////////////
1216 
h_pcmpistri_38(V128 * argL,V128 * argR)1217 UInt h_pcmpistri_38 ( V128* argL, V128* argR )
1218 {
1219    V128 block[2];
1220    memcpy(&block[0], argL, sizeof(V128));
1221    memcpy(&block[1], argR, sizeof(V128));
1222    ULong res, flags;
1223    __asm__ __volatile__(
1224       "subq      $1024,  %%rsp"             "\n\t"
1225       "movdqu    0(%2),  %%xmm2"            "\n\t"
1226       "movdqu    16(%2), %%xmm11"           "\n\t"
1227       "pcmpistri $0x38,  %%xmm2, %%xmm11"   "\n\t"
1228       "pushfq"                              "\n\t"
1229       "popq      %%rdx"                     "\n\t"
1230       "movq      %%rcx,  %0"                "\n\t"
1231       "movq      %%rdx,  %1"                "\n\t"
1232       "addq      $1024,  %%rsp"             "\n\t"
1233       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1234       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1235    );
1236    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1237 }
1238 
s_pcmpistri_38(V128 * argLU,V128 * argRU)1239 UInt s_pcmpistri_38 ( V128* argLU, V128* argRU )
1240 {
1241    V128 resV;
1242    UInt resOSZACP, resECX;
1243    Bool ok
1244       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1245                        zmask_from_V128(argLU),
1246                        zmask_from_V128(argRU),
1247                        0x38, False/*!isSTRM*/
1248         );
1249    assert(ok);
1250    resECX = resV.uInt[0];
1251    return (resOSZACP << 16) | resECX;
1252 }
1253 
istri_38(void)1254 void istri_38 ( void )
1255 {
1256    char* wot = "38";
1257    UInt(*h)(V128*,V128*) = h_pcmpistri_38;
1258    UInt(*s)(V128*,V128*) = s_pcmpistri_38;
1259 
1260    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1261 
1262    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1263    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1264    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
1265    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
1266 
1267    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
1268    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
1269    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
1270 
1271    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1272    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1273    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1274    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1275 
1276    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1277    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
1278    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
1279 
1280    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1281 
1282    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1283    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1284    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
1285 
1286    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
1287    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1288    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
1289 
1290    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1291    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
1292    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
1293 
1294    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
1295    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
1296    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
1297 
1298    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
1299    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
1300 }
1301 
1302 
1303 
1304 //////////////////////////////////////////////////////////
1305 //                                                      //
1306 //                       ISTRI_46                       //
1307 //                                                      //
1308 //////////////////////////////////////////////////////////
1309 
h_pcmpistri_46(V128 * argL,V128 * argR)1310 UInt h_pcmpistri_46 ( V128* argL, V128* argR )
1311 {
1312    V128 block[2];
1313    memcpy(&block[0], argL, sizeof(V128));
1314    memcpy(&block[1], argR, sizeof(V128));
1315    ULong res, flags;
1316    __asm__ __volatile__(
1317       "subq      $1024,  %%rsp"             "\n\t"
1318       "movdqu    0(%2),  %%xmm2"            "\n\t"
1319       "movdqu    16(%2), %%xmm11"           "\n\t"
1320       "pcmpistri $0x46,  %%xmm2, %%xmm11"   "\n\t"
1321       "pushfq"                              "\n\t"
1322       "popq      %%rdx"                     "\n\t"
1323       "movq      %%rcx,  %0"                "\n\t"
1324       "movq      %%rdx,  %1"                "\n\t"
1325       "addq      $1024,  %%rsp"             "\n\t"
1326       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1327       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1328    );
1329    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1330 }
1331 
s_pcmpistri_46(V128 * argLU,V128 * argRU)1332 UInt s_pcmpistri_46 ( V128* argLU, V128* argRU )
1333 {
1334    V128 resV;
1335    UInt resOSZACP, resECX;
1336    Bool ok
1337       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1338                        zmask_from_V128(argLU),
1339                        zmask_from_V128(argRU),
1340                        0x46, False/*!isSTRM*/
1341         );
1342    assert(ok);
1343    resECX = resV.uInt[0];
1344    return (resOSZACP << 16) | resECX;
1345 }
1346 
istri_46(void)1347 void istri_46 ( void )
1348 {
1349    char* wot = "46";
1350    UInt(*h)(V128*,V128*) = h_pcmpistri_46;
1351    UInt(*s)(V128*,V128*) = s_pcmpistri_46;
1352 
1353    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
1354    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
1355    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
1356    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
1357 
1358    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1359    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
1360    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
1361    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
1362    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
1363 
1364    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1365 
1366    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1367    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
1368    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
1369 
1370    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
1371    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
1372    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
1373 
1374    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
1375    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
1376 
1377    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
1378    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
1379 
1380    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
1381    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
1382    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
1383    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
1384 }
1385 
1386 
1387 //////////////////////////////////////////////////////////
1388 //                                                      //
1389 //                       ISTRI_30                       //
1390 //                                                      //
1391 //////////////////////////////////////////////////////////
1392 
h_pcmpistri_30(V128 * argL,V128 * argR)1393 UInt h_pcmpistri_30 ( V128* argL, V128* argR )
1394 {
1395    V128 block[2];
1396    memcpy(&block[0], argL, sizeof(V128));
1397    memcpy(&block[1], argR, sizeof(V128));
1398    ULong res, flags;
1399    __asm__ __volatile__(
1400       "subq      $1024,  %%rsp"             "\n\t"
1401       "movdqu    0(%2),  %%xmm2"            "\n\t"
1402       "movdqu    16(%2), %%xmm11"           "\n\t"
1403       "pcmpistri $0x30,  %%xmm2, %%xmm11"   "\n\t"
1404       "pushfq"                              "\n\t"
1405       "popq      %%rdx"                     "\n\t"
1406       "movq      %%rcx,  %0"                "\n\t"
1407       "movq      %%rdx,  %1"                "\n\t"
1408       "addq      $1024,  %%rsp"             "\n\t"
1409       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1410       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1411    );
1412    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1413 }
1414 
s_pcmpistri_30(V128 * argLU,V128 * argRU)1415 UInt s_pcmpistri_30 ( V128* argLU, V128* argRU )
1416 {
1417    V128 resV;
1418    UInt resOSZACP, resECX;
1419    Bool ok
1420       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1421                        zmask_from_V128(argLU),
1422                        zmask_from_V128(argRU),
1423                        0x30, False/*!isSTRM*/
1424         );
1425    assert(ok);
1426    resECX = resV.uInt[0];
1427    return (resOSZACP << 16) | resECX;
1428 }
1429 
istri_30(void)1430 void istri_30 ( void )
1431 {
1432    char* wot = "30";
1433    UInt(*h)(V128*,V128*) = h_pcmpistri_30;
1434    UInt(*s)(V128*,V128*) = s_pcmpistri_30;
1435 
1436    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
1437    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
1438    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
1439    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
1440 
1441    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1442    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
1443    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
1444    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
1445    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
1446 
1447    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1448    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
1449    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
1450    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
1451 
1452    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1453    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1454 
1455    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
1456    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
1457    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
1458    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
1459 
1460    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
1461 
1462    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
1463    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
1464 }
1465 
1466 
1467 //////////////////////////////////////////////////////////
1468 //                                                      //
1469 //                       ISTRI_40                       //
1470 //                                                      //
1471 //////////////////////////////////////////////////////////
1472 
h_pcmpistri_40(V128 * argL,V128 * argR)1473 UInt h_pcmpistri_40 ( V128* argL, V128* argR )
1474 {
1475    V128 block[2];
1476    memcpy(&block[0], argL, sizeof(V128));
1477    memcpy(&block[1], argR, sizeof(V128));
1478    ULong res, flags;
1479    __asm__ __volatile__(
1480       "subq      $1024,  %%rsp"             "\n\t"
1481       "movdqu    0(%2),  %%xmm2"            "\n\t"
1482       "movdqu    16(%2), %%xmm11"           "\n\t"
1483       "pcmpistri $0x40,  %%xmm2, %%xmm11"   "\n\t"
1484       "pushfq"                              "\n\t"
1485       "popq      %%rdx"                     "\n\t"
1486       "movq      %%rcx,  %0"                "\n\t"
1487       "movq      %%rdx,  %1"                "\n\t"
1488       "addq      $1024,  %%rsp"             "\n\t"
1489       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1490       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1491    );
1492    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1493 }
1494 
s_pcmpistri_40(V128 * argLU,V128 * argRU)1495 UInt s_pcmpistri_40 ( V128* argLU, V128* argRU )
1496 {
1497    V128 resV;
1498    UInt resOSZACP, resECX;
1499    Bool ok
1500       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1501                        zmask_from_V128(argLU),
1502                        zmask_from_V128(argRU),
1503                        0x40, False/*!isSTRM*/
1504         );
1505    assert(ok);
1506    resECX = resV.uInt[0];
1507    return (resOSZACP << 16) | resECX;
1508 }
1509 
istri_40(void)1510 void istri_40 ( void )
1511 {
1512    char* wot = "40";
1513    UInt(*h)(V128*,V128*) = h_pcmpistri_40;
1514    UInt(*s)(V128*,V128*) = s_pcmpistri_40;
1515 
1516    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
1517    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
1518    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
1519    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
1520 
1521    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1522    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
1523    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
1524    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
1525    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
1526 
1527    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1528    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
1529    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
1530    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
1531 
1532    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1533    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1534 
1535    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
1536    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
1537    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
1538    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
1539 
1540    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
1541 
1542    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
1543    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
1544 }
1545 
1546 
1547 //////////////////////////////////////////////////////////
1548 //                                                      //
1549 //                       ISTRI_0E                       //
1550 //                                                      //
1551 //////////////////////////////////////////////////////////
1552 
1553 __attribute__((noinline))
h_pcmpistri_0E(V128 * argL,V128 * argR)1554 UInt h_pcmpistri_0E ( V128* argL, V128* argR )
1555 {
1556    V128 block[2];
1557    memcpy(&block[0], argL, sizeof(V128));
1558    memcpy(&block[1], argR, sizeof(V128));
1559    ULong res = 0, flags = 0;
1560    __asm__ __volatile__(
1561       "movdqu    0(%2),  %%xmm2"            "\n\t"
1562       "movdqu    16(%2), %%xmm11"           "\n\t"
1563       "pcmpistri $0x0E,  %%xmm2, %%xmm11"   "\n\t"
1564       "pushfq"                              "\n\t"
1565       "popq      %%rdx"                     "\n\t"
1566       "movq      %%rcx,  %0"                "\n\t"
1567       "movq      %%rdx,  %1"                "\n\t"
1568       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1569       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1570    );
1571    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1572 }
1573 
s_pcmpistri_0E(V128 * argLU,V128 * argRU)1574 UInt s_pcmpistri_0E ( V128* argLU, V128* argRU )
1575 {
1576    V128 resV;
1577    UInt resOSZACP, resECX;
1578    Bool ok
1579       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1580                        zmask_from_V128(argLU),
1581                        zmask_from_V128(argRU),
1582                        0x0E, False/*!isSTRM*/
1583         );
1584    assert(ok);
1585    resECX = resV.uInt[0];
1586    return (resOSZACP << 16) | resECX;
1587 }
1588 
istri_0E(void)1589 void istri_0E ( void )
1590 {
1591    char* wot = "0E";
1592    UInt(*h)(V128*,V128*) = h_pcmpistri_0E;
1593    UInt(*s)(V128*,V128*) = s_pcmpistri_0E;
1594 
1595    try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
1596 
1597    try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
1598 
1599    try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
1600    try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
1601    try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
1602 
1603    try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
1604 
1605    try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
1606    try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
1607    try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
1608    try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
1609    try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
1610 
1611    try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
1612    try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
1613    try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
1614 
1615    try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
1616    try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
1617 
1618    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
1619    try_istri(wot,h,s, "1111111111111234", "0000000000000001");
1620    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
1621 
1622    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
1623    try_istri(wot,h,s, "a111111111111111", "000000000000000a");
1624    try_istri(wot,h,s, "b111111111111111", "000000000000000a");
1625 
1626    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
1627    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1628    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
1629    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
1630 }
1631 
1632 
1633 //////////////////////////////////////////////////////////
1634 //                                                      //
1635 //                       ISTRI_34                       //
1636 //                                                      //
1637 //////////////////////////////////////////////////////////
1638 
h_pcmpistri_34(V128 * argL,V128 * argR)1639 UInt h_pcmpistri_34 ( V128* argL, V128* argR )
1640 {
1641    V128 block[2];
1642    memcpy(&block[0], argL, sizeof(V128));
1643    memcpy(&block[1], argR, sizeof(V128));
1644    ULong res, flags;
1645    __asm__ __volatile__(
1646       "subq      $1024,  %%rsp"             "\n\t"
1647       "movdqu    0(%2),  %%xmm2"            "\n\t"
1648       "movdqu    16(%2), %%xmm11"           "\n\t"
1649       "pcmpistri $0x34,  %%xmm2, %%xmm11"   "\n\t"
1650       "pushfq"                              "\n\t"
1651       "popq      %%rdx"                     "\n\t"
1652       "movq      %%rcx,  %0"                "\n\t"
1653       "movq      %%rdx,  %1"                "\n\t"
1654       "addq      $1024,  %%rsp"             "\n\t"
1655       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1656       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1657    );
1658    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1659 }
1660 
s_pcmpistri_34(V128 * argLU,V128 * argRU)1661 UInt s_pcmpistri_34 ( V128* argLU, V128* argRU )
1662 {
1663    V128 resV;
1664    UInt resOSZACP, resECX;
1665    Bool ok
1666       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1667                        zmask_from_V128(argLU),
1668                        zmask_from_V128(argRU),
1669                        0x34, False/*!isSTRM*/
1670         );
1671    assert(ok);
1672    resECX = resV.uInt[0];
1673    return (resOSZACP << 16) | resECX;
1674 }
1675 
istri_34(void)1676 void istri_34 ( void )
1677 {
1678    char* wot = "34";
1679    UInt(*h)(V128*,V128*) = h_pcmpistri_34;
1680    UInt(*s)(V128*,V128*) = s_pcmpistri_34;
1681 
1682    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
1683    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
1684    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
1685    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
1686 
1687    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1688    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
1689    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
1690    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
1691    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
1692 
1693    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1694 
1695    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1696    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
1697    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
1698 
1699    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
1700    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
1701    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
1702 
1703    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
1704    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
1705 
1706    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
1707    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
1708 
1709    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
1710    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
1711    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
1712    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
1713 }
1714 
1715 
1716 //////////////////////////////////////////////////////////
1717 //                                                      //
1718 //                       ISTRI_14                       //
1719 //                                                      //
1720 //////////////////////////////////////////////////////////
1721 
h_pcmpistri_14(V128 * argL,V128 * argR)1722 UInt h_pcmpistri_14 ( V128* argL, V128* argR )
1723 {
1724    V128 block[2];
1725    memcpy(&block[0], argL, sizeof(V128));
1726    memcpy(&block[1], argR, sizeof(V128));
1727    ULong res, flags;
1728    __asm__ __volatile__(
1729       "subq      $1024,  %%rsp"             "\n\t"
1730       "movdqu    0(%2),  %%xmm2"            "\n\t"
1731       "movdqu    16(%2), %%xmm11"           "\n\t"
1732       "pcmpistri $0x14,  %%xmm2, %%xmm11"   "\n\t"
1733       "pushfq"                              "\n\t"
1734       "popq      %%rdx"                     "\n\t"
1735       "movq      %%rcx,  %0"                "\n\t"
1736       "movq      %%rdx,  %1"                "\n\t"
1737       "addq      $1024,  %%rsp"             "\n\t"
1738       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1739       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1740    );
1741    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1742 }
1743 
s_pcmpistri_14(V128 * argLU,V128 * argRU)1744 UInt s_pcmpistri_14 ( V128* argLU, V128* argRU )
1745 {
1746    V128 resV;
1747    UInt resOSZACP, resECX;
1748    Bool ok
1749       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1750                        zmask_from_V128(argLU),
1751                        zmask_from_V128(argRU),
1752                        0x14, False/*!isSTRM*/
1753         );
1754    assert(ok);
1755    resECX = resV.uInt[0];
1756    return (resOSZACP << 16) | resECX;
1757 }
1758 
istri_14(void)1759 void istri_14 ( void )
1760 {
1761    char* wot = "14";
1762    UInt(*h)(V128*,V128*) = h_pcmpistri_14;
1763    UInt(*s)(V128*,V128*) = s_pcmpistri_14;
1764 
1765    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
1766    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
1767    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
1768    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
1769 
1770    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1771    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
1772    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
1773    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
1774    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
1775 
1776    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1777 
1778    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1779    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
1780    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
1781 
1782    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
1783    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
1784    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
1785 
1786    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
1787    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
1788 
1789    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
1790    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
1791 
1792    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
1793    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
1794    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
1795    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
1796 }
1797 
1798 
1799 //////////////////////////////////////////////////////////
1800 //                                                      //
1801 //                         main                         //
1802 //                                                      //
1803 //////////////////////////////////////////////////////////
1804 
main(void)1805 int main ( void )
1806 {
1807    istri_4A();
1808    istri_3A();
1809    istri_08();
1810    istri_1A();
1811    istri_02();
1812    istri_0C();
1813    istri_12();
1814    istri_44();
1815    istri_00();
1816    istri_38();
1817    istri_46();
1818    istri_30();
1819    istri_40();
1820    istri_0E();
1821    istri_14();
1822    istri_34();
1823    return 0;
1824 }
1825