/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2011 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
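
/* For orientation: mk32x2 puts w1 in the upper 32 bits and w0 in the
   lower 32 bits, and sel32x2_1/sel32x2_0 pull them back out; e.g.
   mk32x2(0x11112222, 0x33334444) == 0x1111222233334444ULL, of which
   sel32x2_1 returns 0x11112222 and sel32x2_0 returns 0x33334444. */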


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}
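
/* Lane numbering is consistent throughout: lane 0 is the least
   significant part of the 64-bit value and lane N-1 the most
   significant.  So index8x8(0x0706050403020100ULL, 3) == 0x03,
   with the index ix taken modulo 8. */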


/* Scalar helpers. */

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
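
/* The saturating adds clamp to the lane's representable range instead
   of wrapping; e.g. qadd16S(30000, 30000) == 32767,
   qadd8S(-100, -100) == -128, and qadd8U(0xF0, 0x20) == 0xFF. */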

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}
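
/* mulhi16S/mulhi16U return the upper 16 bits of the full 32-bit
   product; e.g. mulhi16S(0x4000, 0x4000) == 0x1000 and
   mulhi16U(0xFFFF, 0xFFFF) == 0xFFFE. */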

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}
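
/* The qnarrow helpers reinterpret the argument as signed and clamp it
   to the destination range; e.g. qnarrow32Sto16S(100000) == 32767,
   qnarrow32Sto16S(-100000) == -32768, and qnarrow16Sto8U(0xFF80)
   (that is, -128 viewed as a Short) == 0. */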

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}
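
/* The sar helpers rely on ">>" of a negative signed value being an
   arithmetic shift.  That is implementation-defined in ISO C, but it
   holds for gcc and clang, which is what this code assumes; so for
   example sar8(0x80, 7) yields 0xFF. */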

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}
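
/* avg8U/avg16U compute the rounding average (x + y + 1) >> 1, so
   halves round up; e.g. avg8U(1, 2) == 2 and avg8U(0xFF, 0xFF) == 0xFF. */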

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}
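
/* The halving add/sub helpers widen first, so the intermediate sum or
   difference cannot overflow; the low bit of the true result is then
   dropped, e.g. hadd16U(1, 2) == 1 and hsub8U(3, 1) == 1. */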

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}
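
/* These lane-wise adds wrap around within each lane; e.g.
   h_generic_calc_Add16x4(0x000100020003FFFFULL, 0x0001000100010001ULL)
   == 0x0002000300040000ULL, the lowest lane having wrapped. */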

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}
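
/* The binary narrowing ops pack the saturated lanes of aa into the
   upper half of the result and those of bb into the lower half; a
   32-bit lane holding, say, 100000 comes out as 0x7FFF. */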

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}
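
/* The interleaves alternate lanes of aa and bb, aa supplying the more
   significant lane of each pair; e.g.
   h_generic_calc_InterleaveHI8x8(0x0706050403020100ULL,
                                  0x1716151413121110ULL)
   == 0x0717061605150414ULL. */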

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}
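
/* CatOddLanes16x4 packs lanes 3 and 1 of aa above lanes 3 and 1 of bb;
   CatEvenLanes16x4 does the same with lanes 2 and 0. */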

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
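
/* Each byte of the Perm8x8 result is the byte of aa selected by the
   low three bits of the corresponding byte of bb. */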

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  Indeed, given the
   semantics of these primops (ShlN16x4, etc), it is an error if we
   are ever given an out-of-range shift amount.
*/
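
/* For example, h_generic_calc_ShlN16x4(0x8001800180018001ULL, 1)
   == 0x0002000200020002ULL: bits shifted out of a lane are simply
   discarded. */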
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
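
/* Sad8Ux4 sums the absolute differences of the four byte lanes; e.g.
   h_generic_calc_Sad8Ux4(0x00104080, 0x10004080) == 0x20. */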


/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/