• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                             host_generic_simd64.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2004-2012 OpenWorks LLP
11       info@open-works.net
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 
30    Neither the names of the U.S. Department of Energy nor the
31    University of California nor the names of its contributors may be
32    used to endorse or promote products derived from this software
33    without prior written permission.
34 */
35 
36 /* Generic helper functions for doing 64-bit SIMD arithmetic in cases
37    where the instruction selectors cannot generate code in-line.
38    These are purely back-end entities and cannot be seen/referenced
39    from IR. */
40 
41 #include "libvex_basictypes.h"
42 #include "host_generic_simd64.h"
43 
44 
45 
46 /* Tuple/select functions for 32x2 vectors. */
47 
mk32x2(UInt w1,UInt w0)48 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
49    return (((ULong)w1) << 32) | ((ULong)w0);
50 }
51 
sel32x2_1(ULong w64)52 static inline UInt sel32x2_1 ( ULong w64 ) {
53    return 0xFFFFFFFF & toUInt(w64 >> 32);
54 }
sel32x2_0(ULong w64)55 static inline UInt sel32x2_0 ( ULong w64 ) {
56    return 0xFFFFFFFF & toUInt(w64);
57 }
58 
59 
60 /* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
61    with 64-bit shifts so we give it a hand. */
62 
mk16x4(UShort w3,UShort w2,UShort w1,UShort w0)63 static inline ULong mk16x4 ( UShort w3, UShort w2,
64                              UShort w1, UShort w0 ) {
65    UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
66    UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
67    return mk32x2(hi32, lo32);
68 }
69 
sel16x4_3(ULong w64)70 static inline UShort sel16x4_3 ( ULong w64 ) {
71    UInt hi32 = toUInt(w64 >> 32);
72    return toUShort(0xFFFF & (hi32 >> 16));
73 }
sel16x4_2(ULong w64)74 static inline UShort sel16x4_2 ( ULong w64 ) {
75    UInt hi32 = toUInt(w64 >> 32);
76    return toUShort(0xFFFF & hi32);
77 }
sel16x4_1(ULong w64)78 static inline UShort sel16x4_1 ( ULong w64 ) {
79    UInt lo32 = (UInt)w64;
80    return toUShort(0xFFFF & (lo32 >> 16));
81 }
sel16x4_0(ULong w64)82 static inline UShort sel16x4_0 ( ULong w64 ) {
83    UInt lo32 = (UInt)w64;
84    return toUShort(0xFFFF & lo32);
85 }
86 
87 
88 /* Tuple/select functions for 8x8 vectors. */
89 
mk8x8(UChar w7,UChar w6,UChar w5,UChar w4,UChar w3,UChar w2,UChar w1,UChar w0)90 static inline ULong mk8x8 ( UChar w7, UChar w6,
91                             UChar w5, UChar w4,
92                             UChar w3, UChar w2,
93                             UChar w1, UChar w0 ) {
94    UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
95                | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
96    UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
97                | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
98    return mk32x2(hi32, lo32);
99 }
100 
sel8x8_7(ULong w64)101 static inline UChar sel8x8_7 ( ULong w64 ) {
102    UInt hi32 = toUInt(w64 >> 32);
103    return toUChar(0xFF & (hi32 >> 24));
104 }
sel8x8_6(ULong w64)105 static inline UChar sel8x8_6 ( ULong w64 ) {
106    UInt hi32 = toUInt(w64 >> 32);
107    return toUChar(0xFF & (hi32 >> 16));
108 }
sel8x8_5(ULong w64)109 static inline UChar sel8x8_5 ( ULong w64 ) {
110    UInt hi32 = toUInt(w64 >> 32);
111    return toUChar(0xFF & (hi32 >> 8));
112 }
sel8x8_4(ULong w64)113 static inline UChar sel8x8_4 ( ULong w64 ) {
114    UInt hi32 = toUInt(w64 >> 32);
115    return toUChar(0xFF & (hi32 >> 0));
116 }
sel8x8_3(ULong w64)117 static inline UChar sel8x8_3 ( ULong w64 ) {
118    UInt lo32 = (UInt)w64;
119    return toUChar(0xFF & (lo32 >> 24));
120 }
sel8x8_2(ULong w64)121 static inline UChar sel8x8_2 ( ULong w64 ) {
122    UInt lo32 = (UInt)w64;
123    return toUChar(0xFF & (lo32 >> 16));
124 }
sel8x8_1(ULong w64)125 static inline UChar sel8x8_1 ( ULong w64 ) {
126    UInt lo32 = (UInt)w64;
127    return toUChar(0xFF & (lo32 >> 8));
128 }
sel8x8_0(ULong w64)129 static inline UChar sel8x8_0 ( ULong w64 ) {
130    UInt lo32 = (UInt)w64;
131    return toUChar(0xFF & (lo32 >> 0));
132 }
133 
/* Return the byte of w64 selected at run time by ix.  Only the low
   three bits of ix are used, so the lane number is always 0..7. */
static inline UChar index8x8 ( ULong w64, UChar ix )
{
   UInt lane  = ix & 7;
   UInt shift = 8 * lane;
   return toUChar((w64 >> shift) & 0xFF);
}
138 
139 
140 /* Scalar helpers. */
141 
qadd32S(Int xx,Int yy)142 static inline Int qadd32S ( Int xx, Int yy )
143 {
144    Long t = ((Long)xx) + ((Long)yy);
145    const Long loLim = -0x80000000LL;
146    const Long hiLim =  0x7FFFFFFFLL;
147    if (t < loLim) t = loLim;
148    if (t > hiLim) t = hiLim;
149    return (Int)t;
150 }
151 
qadd16S(Short xx,Short yy)152 static inline Short qadd16S ( Short xx, Short yy )
153 {
154    Int t = ((Int)xx) + ((Int)yy);
155    if (t < -32768) t = -32768;
156    if (t > 32767)  t = 32767;
157    return (Short)t;
158 }
159 
qadd8S(Char xx,Char yy)160 static inline Char qadd8S ( Char xx, Char yy )
161 {
162    Int t = ((Int)xx) + ((Int)yy);
163    if (t < -128) t = -128;
164    if (t > 127)  t = 127;
165    return (Char)t;
166 }
167 
qadd16U(UShort xx,UShort yy)168 static inline UShort qadd16U ( UShort xx, UShort yy )
169 {
170    UInt t = ((UInt)xx) + ((UInt)yy);
171    if (t > 0xFFFF) t = 0xFFFF;
172    return (UShort)t;
173 }
174 
/* Unsigned saturating 8-bit add: sums above 0xFF yield 0xFF. */
static inline UChar qadd8U ( UChar xx, UChar yy )
{
   const UInt sum = ((UInt)xx) + ((UInt)yy);
   return (UChar)(sum > 0xFF ? 0xFF : sum);
}
181 
qsub32S(Int xx,Int yy)182 static inline Int qsub32S ( Int xx, Int yy )
183 {
184    Long t = ((Long)xx) - ((Long)yy);
185    const Long loLim = -0x80000000LL;
186    const Long hiLim =  0x7FFFFFFFLL;
187    if (t < loLim) t = loLim;
188    if (t > hiLim) t = hiLim;
189    return (Int)t;
190 }
191 
qsub16S(Short xx,Short yy)192 static inline Short qsub16S ( Short xx, Short yy )
193 {
194    Int t = ((Int)xx) - ((Int)yy);
195    if (t < -32768) t = -32768;
196    if (t > 32767)  t = 32767;
197    return (Short)t;
198 }
199 
qsub8S(Char xx,Char yy)200 static inline Char qsub8S ( Char xx, Char yy )
201 {
202    Int t = ((Int)xx) - ((Int)yy);
203    if (t < -128) t = -128;
204    if (t > 127)  t = 127;
205    return (Char)t;
206 }
207 
qsub16U(UShort xx,UShort yy)208 static inline UShort qsub16U ( UShort xx, UShort yy )
209 {
210    Int t = ((Int)xx) - ((Int)yy);
211    if (t < 0)      t = 0;
212    if (t > 0xFFFF) t = 0xFFFF;
213    return (UShort)t;
214 }
215 
/* Unsigned saturating 8-bit subtract: the Int difference is clamped
   to 0 .. 0xFF, so a negative difference yields 0. */
static inline UChar qsub8U ( UChar xx, UChar yy )
{
   const Int diff = ((Int)xx) - ((Int)yy);
   if (diff < 0)
      return (UChar)0;
   if (diff > 0xFF)
      return (UChar)0xFF;
   return (UChar)diff;
}
223 
mul16(Short xx,Short yy)224 static inline Short mul16 ( Short xx, Short yy )
225 {
226    Int t = ((Int)xx) * ((Int)yy);
227    return (Short)t;
228 }
229 
mul32(Int xx,Int yy)230 static inline Int mul32 ( Int xx, Int yy )
231 {
232    Int t = ((Int)xx) * ((Int)yy);
233    return (Int)t;
234 }
235 
mulhi16S(Short xx,Short yy)236 static inline Short mulhi16S ( Short xx, Short yy )
237 {
238    Int t = ((Int)xx) * ((Int)yy);
239    t >>=/*s*/ 16;
240    return (Short)t;
241 }
242 
mulhi16U(UShort xx,UShort yy)243 static inline UShort mulhi16U ( UShort xx, UShort yy )
244 {
245    UInt t = ((UInt)xx) * ((UInt)yy);
246    t >>=/*u*/ 16;
247    return (UShort)t;
248 }
249 
/* 32-bit equality test: all ones if xx equals yy, otherwise zero. */
static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   if (xx == yy)
      return 0xFFFFFFFF;
   return 0;
}
254 
cmpeq16(UShort xx,UShort yy)255 static inline UShort cmpeq16 ( UShort xx, UShort yy )
256 {
257    return toUShort(xx==yy ? 0xFFFF : 0);
258 }
259 
/* 8-bit equality test: 0xFF if xx equals yy, otherwise zero. */
static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   Int mask = 0;
   if (xx == yy)
      mask = 0xFF;
   return toUChar(mask);
}
264 
cmpgt32S(Int xx,Int yy)265 static inline UInt cmpgt32S ( Int xx, Int yy )
266 {
267    return xx>yy ? 0xFFFFFFFF : 0;
268 }
269 
cmpgt16S(Short xx,Short yy)270 static inline UShort cmpgt16S ( Short xx, Short yy )
271 {
272    return toUShort(xx>yy ? 0xFFFF : 0);
273 }
274 
cmpgt8S(Char xx,Char yy)275 static inline UChar cmpgt8S ( Char xx, Char yy )
276 {
277    return toUChar(xx>yy ? 0xFF : 0);
278 }
279 
/* 32-bit non-zero test: all ones if xx is non-zero, otherwise zero. */
static inline UInt cmpnez32 ( UInt xx )
{
   if (xx != 0)
      return 0xFFFFFFFF;
   return 0;
}
284 
cmpnez16(UShort xx)285 static inline UShort cmpnez16 ( UShort xx )
286 {
287    return toUShort(xx==0 ? 0 : 0xFFFF);
288 }
289 
/* 8-bit non-zero test: 0xFF if xx is non-zero, otherwise zero. */
static inline UChar cmpnez8 ( UChar xx )
{
   Int mask = 0xFF;
   if (xx == 0)
      mask = 0;
   return toUChar(mask);
}
294 
qnarrow32Sto16S(UInt xx0)295 static inline Short qnarrow32Sto16S ( UInt xx0 )
296 {
297    Int xx = (Int)xx0;
298    if (xx < -32768) xx = -32768;
299    if (xx > 32767)  xx = 32767;
300    return (Short)xx;
301 }
302 
qnarrow16Sto8S(UShort xx0)303 static inline Char qnarrow16Sto8S ( UShort xx0 )
304 {
305    Short xx = (Short)xx0;
306    if (xx < -128) xx = -128;
307    if (xx > 127)  xx = 127;
308    return (Char)xx;
309 }
310 
qnarrow16Sto8U(UShort xx0)311 static inline UChar qnarrow16Sto8U ( UShort xx0 )
312 {
313    Short xx = (Short)xx0;
314    if (xx < 0)   xx = 0;
315    if (xx > 255) xx = 255;
316    return (UChar)xx;
317 }
318 
narrow32to16(UInt xx)319 static inline UShort narrow32to16 ( UInt xx )
320 {
321    return (UShort)xx;
322 }
323 
narrow16to8(UShort xx)324 static inline UChar narrow16to8 ( UShort xx )
325 {
326    return (UChar)xx;
327 }
328 
329 /* shifts: we don't care about out-of-range ones, since
330    that is dealt with at a higher level. */
331 
/* Shift an 8-bit value left by n, keeping the low 8 bits.  No range
   check is made on n here. */
static inline UChar shl8 ( UChar v, UInt n )
{
   const Int shifted = v << n;
   return toUChar(shifted);
}
336 
/* Arithmetic right shift of an 8-bit value: v is treated as a
   signed Char before shifting.  No range check is made on n here. */
static inline UChar sar8 ( UChar v, UInt n )
{
   const Char sv = (Char)v;
   return toUChar(sv >> n);
}
341 
shl16(UShort v,UInt n)342 static inline UShort shl16 ( UShort v, UInt n )
343 {
344    return toUShort(v << n);
345 }
346 
shr16(UShort v,UInt n)347 static inline UShort shr16 ( UShort v, UInt n )
348 {
349    return toUShort((((UShort)v) >> n));
350 }
351 
sar16(UShort v,UInt n)352 static inline UShort sar16 ( UShort v, UInt n )
353 {
354    return toUShort(((Short)v) >> n);
355 }
356 
/* Shift a 32-bit value left by n.  No range check is made on n
   here. */
static inline UInt shl32 ( UInt v, UInt n )
{
   const UInt shifted = v << n;
   return shifted;
}
361 
/* Logical right shift of a 32-bit value by n.  No range check is
   made on n here. */
static inline UInt shr32 ( UInt v, UInt n )
{
   const UInt uv = (UInt)v;
   return uv >> n;
}
366 
/* Arithmetic right shift of a 32-bit value: v is treated as a
   signed Int before shifting.  No range check is made on n here. */
static inline UInt sar32 ( UInt v, UInt n )
{
   const Int sv = (Int)v;
   return (UInt)(sv >> n);
}
371 
/* Unsigned 8-bit rounding average: (xx + yy + 1) >> 1, computed in
   UInt so the intermediate sum cannot overflow. */
static inline UChar avg8U ( UChar xx, UChar yy )
{
   const UInt sum = ((UInt)xx) + ((UInt)yy) + 1;
   return (UChar)(sum >> 1);
}
379 
avg16U(UShort xx,UShort yy)380 static inline UShort avg16U ( UShort xx, UShort yy )
381 {
382    UInt xxi = (UInt)xx;
383    UInt yyi = (UInt)yy;
384    UInt r   = (xxi + yyi + 1) >> 1;
385    return (UShort)r;
386 }
387 
max16S(Short xx,Short yy)388 static inline Short max16S ( Short xx, Short yy )
389 {
390    return toUShort((xx > yy) ? xx : yy);
391 }
392 
/* Unsigned 8-bit maximum of xx and yy. */
static inline UChar max8U ( UChar xx, UChar yy )
{
   if (xx > yy)
      return toUChar(xx);
   return toUChar(yy);
}
397 
min16S(Short xx,Short yy)398 static inline Short min16S ( Short xx, Short yy )
399 {
400    return toUShort((xx < yy) ? xx : yy);
401 }
402 
/* Unsigned 8-bit minimum of xx and yy. */
static inline UChar min8U ( UChar xx, UChar yy )
{
   if (xx < yy)
      return toUChar(xx);
   return toUChar(yy);
}
407 
hadd16U(UShort xx,UShort yy)408 static inline UShort hadd16U ( UShort xx, UShort yy )
409 {
410    UInt xxi = (UInt)xx;
411    UInt yyi = (UInt)yy;
412    UInt r   = (xxi + yyi) >> 1;
413    return (UShort)r;
414 }
415 
hadd16S(Short xx,Short yy)416 static inline Short hadd16S ( Short xx, Short yy )
417 {
418    Int xxi = (Int)xx;
419    Int yyi = (Int)yy;
420    Int r   = (xxi + yyi) >> 1;
421    return (Short)r;
422 }
423 
hsub16U(UShort xx,UShort yy)424 static inline UShort hsub16U ( UShort xx, UShort yy )
425 {
426    UInt xxi = (UInt)xx;
427    UInt yyi = (UInt)yy;
428    UInt r   = (xxi - yyi) >> 1;
429    return (UShort)r;
430 }
431 
hsub16S(Short xx,Short yy)432 static inline Short hsub16S ( Short xx, Short yy )
433 {
434    Int xxi = (Int)xx;
435    Int yyi = (Int)yy;
436    Int r   = (xxi - yyi) >> 1;
437    return (Short)r;
438 }
439 
/* Unsigned 8-bit halving add: (xx + yy) >> 1, computed in UInt. */
static inline UChar hadd8U ( UChar xx, UChar yy )
{
   const UInt sum = ((UInt)xx) + ((UInt)yy);
   return (UChar)(sum >> 1);
}
447 
hadd8S(Char xx,Char yy)448 static inline Char hadd8S ( Char xx, Char yy )
449 {
450    Int xxi = (Int)xx;
451    Int yyi = (Int)yy;
452    Int r   = (xxi + yyi) >> 1;
453    return (Char)r;
454 }
455 
/* Unsigned 8-bit halving subtract: (xx - yy) >> 1, computed in UInt
   and then narrowed to UChar. */
static inline UChar hsub8U ( UChar xx, UChar yy )
{
   const UInt diff = ((UInt)xx) - ((UInt)yy);
   return (UChar)(diff >> 1);
}
463 
hsub8S(Char xx,Char yy)464 static inline Char hsub8S ( Char xx, Char yy )
465 {
466    Int xxi = (Int)xx;
467    Int yyi = (Int)yy;
468    Int r   = (xxi - yyi) >> 1;
469    return (Char)r;
470 }
471 
absdiff8U(UChar xx,UChar yy)472 static inline UInt absdiff8U ( UChar xx, UChar yy )
473 {
474    UInt xxu = (UChar)xx;
475    UInt yyu = (UChar)yy;
476    return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
477 }
478 
479 /* ----------------------------------------------------- */
480 /* Start of the externally visible functions.  These simply
481    implement the corresponding IR primops. */
482 /* ----------------------------------------------------- */
483 
484 /* ------------ Normal addition ------------ */
485 
/* Lane-wise add of two 32x2 vectors, using UInt arithmetic in each
   lane. */
ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   UInt r1 = sel32x2_1(xx) + sel32x2_1(yy);
   UInt r0 = sel32x2_0(xx) + sel32x2_0(yy);
   return mk32x2(r1, r0);
}
493 
/* Lane-wise add of two 16x4 vectors; each lane sum is reduced to 16
   bits with toUShort. */
ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   UShort r3 = toUShort( sel16x4_3(xx) + sel16x4_3(yy) );
   UShort r2 = toUShort( sel16x4_2(xx) + sel16x4_2(yy) );
   UShort r1 = toUShort( sel16x4_1(xx) + sel16x4_1(yy) );
   UShort r0 = toUShort( sel16x4_0(xx) + sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
503 
/* Lane-wise add of two 8x8 vectors; each lane sum is reduced to 8
   bits with toUChar. */
ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   UChar r7 = toUChar( sel8x8_7(xx) + sel8x8_7(yy) );
   UChar r6 = toUChar( sel8x8_6(xx) + sel8x8_6(yy) );
   UChar r5 = toUChar( sel8x8_5(xx) + sel8x8_5(yy) );
   UChar r4 = toUChar( sel8x8_4(xx) + sel8x8_4(yy) );
   UChar r3 = toUChar( sel8x8_3(xx) + sel8x8_3(yy) );
   UChar r2 = toUChar( sel8x8_2(xx) + sel8x8_2(yy) );
   UChar r1 = toUChar( sel8x8_1(xx) + sel8x8_1(yy) );
   UChar r0 = toUChar( sel8x8_0(xx) + sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
517 
518 /* ------------ Saturating addition ------------ */
519 
/* Lane-wise signed saturating add of two 16x4 vectors (qadd16S per
   lane). */
ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   UShort r3 = qadd16S( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = qadd16S( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = qadd16S( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = qadd16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
529 
/* Lane-wise signed saturating add of two 8x8 vectors (qadd8S per
   lane). */
ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   UChar r7 = qadd8S( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = qadd8S( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = qadd8S( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = qadd8S( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = qadd8S( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = qadd8S( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = qadd8S( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = qadd8S( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
543 
/* Lane-wise unsigned saturating add of two 16x4 vectors (qadd16U
   per lane). */
ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   UShort r3 = qadd16U( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = qadd16U( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = qadd16U( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = qadd16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
553 
/* Lane-wise unsigned saturating add of two 8x8 vectors (qadd8U per
   lane). */
ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   UChar r7 = qadd8U( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = qadd8U( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = qadd8U( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = qadd8U( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = qadd8U( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = qadd8U( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = qadd8U( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = qadd8U( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
567 
568 /* ------------ Normal subtraction ------------ */
569 
/* Lane-wise subtract (xx - yy) of two 32x2 vectors, using UInt
   arithmetic in each lane. */
ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   UInt r1 = sel32x2_1(xx) - sel32x2_1(yy);
   UInt r0 = sel32x2_0(xx) - sel32x2_0(yy);
   return mk32x2(r1, r0);
}
577 
/* Lane-wise subtract (xx - yy) of two 16x4 vectors; each lane
   difference is reduced to 16 bits with toUShort. */
ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   UShort r3 = toUShort( sel16x4_3(xx) - sel16x4_3(yy) );
   UShort r2 = toUShort( sel16x4_2(xx) - sel16x4_2(yy) );
   UShort r1 = toUShort( sel16x4_1(xx) - sel16x4_1(yy) );
   UShort r0 = toUShort( sel16x4_0(xx) - sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
587 
/* Lane-wise subtract (xx - yy) of two 8x8 vectors; each lane
   difference is reduced to 8 bits with toUChar. */
ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   UChar r7 = toUChar( sel8x8_7(xx) - sel8x8_7(yy) );
   UChar r6 = toUChar( sel8x8_6(xx) - sel8x8_6(yy) );
   UChar r5 = toUChar( sel8x8_5(xx) - sel8x8_5(yy) );
   UChar r4 = toUChar( sel8x8_4(xx) - sel8x8_4(yy) );
   UChar r3 = toUChar( sel8x8_3(xx) - sel8x8_3(yy) );
   UChar r2 = toUChar( sel8x8_2(xx) - sel8x8_2(yy) );
   UChar r1 = toUChar( sel8x8_1(xx) - sel8x8_1(yy) );
   UChar r0 = toUChar( sel8x8_0(xx) - sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
601 
602 /* ------------ Saturating subtraction ------------ */
603 
/* Lane-wise signed saturating subtract of two 16x4 vectors (qsub16S
   per lane). */
ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   UShort r3 = qsub16S( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = qsub16S( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = qsub16S( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = qsub16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
613 
/* Lane-wise signed saturating subtract of two 8x8 vectors (qsub8S
   per lane). */
ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   UChar r7 = qsub8S( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = qsub8S( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = qsub8S( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = qsub8S( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = qsub8S( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = qsub8S( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = qsub8S( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = qsub8S( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
627 
/* Lane-wise unsigned saturating subtract of two 16x4 vectors
   (qsub16U per lane). */
ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   UShort r3 = qsub16U( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = qsub16U( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = qsub16U( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = qsub16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
637 
/* Lane-wise unsigned saturating subtract of two 8x8 vectors (qsub8U
   per lane). */
ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   UChar r7 = qsub8U( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = qsub8U( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = qsub8U( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = qsub8U( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = qsub8U( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = qsub8U( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = qsub8U( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = qsub8U( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
651 
652 /* ------------ Multiplication ------------ */
653 
/* Lane-wise multiply of two 16x4 vectors (mul16 per lane). */
ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   UShort r3 = mul16( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = mul16( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = mul16( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = mul16( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
663 
/* Lane-wise multiply of two 32x2 vectors (mul32 per lane). */
ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   UInt r1 = mul32( sel32x2_1(xx), sel32x2_1(yy) );
   UInt r0 = mul32( sel32x2_0(xx), sel32x2_0(yy) );
   return mk32x2(r1, r0);
}
671 
/* Lane-wise signed high-half multiply of two 16x4 vectors (mulhi16S
   per lane). */
ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   UShort r3 = mulhi16S( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = mulhi16S( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = mulhi16S( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = mulhi16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
681 
/* Lane-wise unsigned high-half multiply of two 16x4 vectors
   (mulhi16U per lane). */
ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   UShort r3 = mulhi16U( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = mulhi16U( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = mulhi16U( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = mulhi16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
691 
692 /* ------------ Comparison ------------ */
693 
/* Lane-wise equality test of two 32x2 vectors; each result lane is
   all ones or all zeroes (cmpeq32 per lane). */
ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   UInt r1 = cmpeq32( sel32x2_1(xx), sel32x2_1(yy) );
   UInt r0 = cmpeq32( sel32x2_0(xx), sel32x2_0(yy) );
   return mk32x2(r1, r0);
}
701 
/* Lane-wise equality test of two 16x4 vectors; each result lane is
   all ones or all zeroes (cmpeq16 per lane). */
ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   UShort r3 = cmpeq16( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = cmpeq16( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = cmpeq16( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = cmpeq16( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
711 
/* Lane-wise equality test of two 8x8 vectors; each result lane is
   all ones or all zeroes (cmpeq8 per lane). */
ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   UChar r7 = cmpeq8( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = cmpeq8( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = cmpeq8( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = cmpeq8( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = cmpeq8( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = cmpeq8( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = cmpeq8( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = cmpeq8( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
725 
/* Lane-wise signed greater-than test (xx > yy) of two 32x2 vectors;
   each result lane is all ones or all zeroes (cmpgt32S per lane). */
ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   UInt r1 = cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) );
   UInt r0 = cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) );
   return mk32x2(r1, r0);
}
733 
/* Lane-wise signed greater-than test (xx > yy) of two 16x4 vectors;
   each result lane is all ones or all zeroes (cmpgt16S per lane). */
ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   UShort r3 = cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) );
   UShort r2 = cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) );
   UShort r1 = cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) );
   UShort r0 = cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
743 
/* Lane-wise signed greater-than test (xx > yy) of two 8x8 vectors;
   each result lane is all ones or all zeroes (cmpgt8S per lane). */
ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   UChar r7 = cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) );
   UChar r6 = cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) );
   UChar r5 = cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) );
   UChar r4 = cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) );
   UChar r3 = cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) );
   UChar r2 = cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) );
   UChar r1 = cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) );
   UChar r0 = cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
757 
/* Lane-wise non-zero test of a 32x2 vector; each result lane is all
   ones if the source lane is non-zero, else zero. */
ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   UInt r1 = cmpnez32( sel32x2_1(xx) );
   UInt r0 = cmpnez32( sel32x2_0(xx) );
   return mk32x2(r1, r0);
}
765 
/* Lane-wise non-zero test of a 16x4 vector; each result lane is all
   ones if the source lane is non-zero, else zero. */
ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   UShort r3 = cmpnez16( sel16x4_3(xx) );
   UShort r2 = cmpnez16( sel16x4_2(xx) );
   UShort r1 = cmpnez16( sel16x4_1(xx) );
   UShort r0 = cmpnez16( sel16x4_0(xx) );
   return mk16x4(r3, r2, r1, r0);
}
775 
/* Lane-wise non-zero test of an 8x8 vector; each result lane is all
   ones if the source lane is non-zero, else zero. */
ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   UChar r7 = cmpnez8( sel8x8_7(xx) );
   UChar r6 = cmpnez8( sel8x8_6(xx) );
   UChar r5 = cmpnez8( sel8x8_5(xx) );
   UChar r4 = cmpnez8( sel8x8_4(xx) );
   UChar r3 = cmpnez8( sel8x8_3(xx) );
   UChar r2 = cmpnez8( sel8x8_2(xx) );
   UChar r1 = cmpnez8( sel8x8_1(xx) );
   UChar r0 = cmpnez8( sel8x8_0(xx) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
789 
790 /* ------------ Saturating narrowing ------------ */
791 
/* Saturating narrow of two 32x2 vectors into one 16x4 vector, signed
   to signed.  The lanes of aa fill the upper two result lanes and
   the lanes of bb fill the lower two. */
ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   return mk16x4(
             qnarrow32Sto16S( sel32x2_1(aa) ),
             qnarrow32Sto16S( sel32x2_0(aa) ),
             qnarrow32Sto16S( sel32x2_1(bb) ),
             qnarrow32Sto16S( sel32x2_0(bb) )
          );
}
805 
/* Saturating narrow of two 16x4 vectors into one 8x8 vector, signed
   to signed.  The lanes of aa fill the upper four result lanes and
   the lanes of bb fill the lower four. */
ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   return mk8x8(
             qnarrow16Sto8S( sel16x4_3(aa) ),
             qnarrow16Sto8S( sel16x4_2(aa) ),
             qnarrow16Sto8S( sel16x4_1(aa) ),
             qnarrow16Sto8S( sel16x4_0(aa) ),
             qnarrow16Sto8S( sel16x4_3(bb) ),
             qnarrow16Sto8S( sel16x4_2(bb) ),
             qnarrow16Sto8S( sel16x4_1(bb) ),
             qnarrow16Sto8S( sel16x4_0(bb) )
          );
}
827 
/* Saturating narrow of two 16x4 vectors into one 8x8 vector, signed
   to unsigned.  The lanes of aa fill the upper four result lanes
   and the lanes of bb fill the lower four. */
ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   return mk8x8(
             qnarrow16Sto8U( sel16x4_3(aa) ),
             qnarrow16Sto8U( sel16x4_2(aa) ),
             qnarrow16Sto8U( sel16x4_1(aa) ),
             qnarrow16Sto8U( sel16x4_0(aa) ),
             qnarrow16Sto8U( sel16x4_3(bb) ),
             qnarrow16Sto8U( sel16x4_2(bb) ),
             qnarrow16Sto8U( sel16x4_1(bb) ),
             qnarrow16Sto8U( sel16x4_0(bb) )
          );
}
849 
850 /* ------------ Truncating narrowing ------------ */
851 
/* Truncating narrow of two 32x2 vectors into one 16x4 vector.  The
   lanes of aa fill the upper two result lanes and the lanes of bb
   fill the lower two. */
ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             narrow32to16( sel32x2_1(aa) ),
             narrow32to16( sel32x2_0(aa) ),
             narrow32to16( sel32x2_1(bb) ),
             narrow32to16( sel32x2_0(bb) )
          );
}
865 
/* Truncating narrow of two 16x4 vectors into one 8x8 vector.  The
   lanes of aa fill the upper four result lanes and the lanes of bb
   fill the lower four. */
ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             narrow16to8( sel16x4_3(aa) ),
             narrow16to8( sel16x4_2(aa) ),
             narrow16to8( sel16x4_1(aa) ),
             narrow16to8( sel16x4_0(aa) ),
             narrow16to8( sel16x4_3(bb) ),
             narrow16to8( sel16x4_2(bb) ),
             narrow16to8( sel16x4_1(bb) ),
             narrow16to8( sel16x4_0(bb) )
          );
}
887 
888 /* ------------ Interleaving ------------ */
889 
/* Interleave the upper four byte lanes (7..4) of aa and bb.  In each
   result pair the byte from aa is the more significant one. */
ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   UChar a7 = sel8x8_7(aa), b7 = sel8x8_7(bb);
   UChar a6 = sel8x8_6(aa), b6 = sel8x8_6(bb);
   UChar a5 = sel8x8_5(aa), b5 = sel8x8_5(bb);
   UChar a4 = sel8x8_4(aa), b4 = sel8x8_4(bb);
   return mk8x8(a7, b7, a6, b6, a5, b5, a4, b4);
}
903 
/* Interleave the lower four byte lanes (3..0) of aa and bb.  In each
   result pair the byte from aa is the more significant one. */
ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   UChar a3 = sel8x8_3(aa), b3 = sel8x8_3(bb);
   UChar a2 = sel8x8_2(aa), b2 = sel8x8_2(bb);
   UChar a1 = sel8x8_1(aa), b1 = sel8x8_1(bb);
   UChar a0 = sel8x8_0(aa), b0 = sel8x8_0(bb);
   return mk8x8(a3, b3, a2, b2, a1, b1, a0, b0);
}
917 
/* Interleave the upper two 16-bit lanes (3 and 2) of aa and bb.  In
   each result pair the lane from aa is the more significant one. */
ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   UShort a3 = sel16x4_3(aa), b3 = sel16x4_3(bb);
   UShort a2 = sel16x4_2(aa), b2 = sel16x4_2(bb);
   return mk16x4(a3, b3, a2, b2);
}
927 
/* Interleave the lower two 16-bit lanes (1 and 0) of aa and bb.  In
   each result pair the lane from aa is the more significant one. */
ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   UShort a1 = sel16x4_1(aa), b1 = sel16x4_1(bb);
   UShort a0 = sel16x4_0(aa), b0 = sel16x4_0(bb);
   return mk16x4(a1, b1, a0, b0);
}
937 
/* Combine the upper 32-bit lanes of aa and bb; the lane from aa
   becomes the upper half of the result. */
ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   UInt fromA = sel32x2_1(aa);
   UInt fromB = sel32x2_1(bb);
   return mk32x2(fromA, fromB);
}
945 
/* Combine the lower 32-bit lanes of aa and bb; the lane from aa
   becomes the upper half of the result. */
ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   UInt fromA = sel32x2_0(aa);
   UInt fromB = sel32x2_0(bb);
   return mk32x2(fromA, fromB);
}
953 
954 /* ------------ Concatenation ------------ */
955 
/* Concatenate the odd-numbered 16-bit lanes (3 and 1) of aa and bb;
   the lanes from aa form the upper half of the result. */
ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   UShort a3 = sel16x4_3(aa), a1 = sel16x4_1(aa);
   UShort b3 = sel16x4_3(bb), b1 = sel16x4_1(bb);
   return mk16x4(a3, a1, b3, b1);
}
965 
/* Concatenate the even-numbered 16-bit lanes (2 and 0) of aa and bb;
   the lanes from aa form the upper half of the result. */
ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   UShort a2 = sel16x4_2(aa), a0 = sel16x4_0(aa);
   UShort b2 = sel16x4_2(bb), b0 = sel16x4_0(bb);
   return mk16x4(a2, a0, b2, b0);
}
975 
976 /* misc hack looking for a proper home */
/* Byte permutation.  For each lane n = 7..0, lane n of bb is passed
   to index8x8 as the selector into aa, and the value returned is
   handed to mk8x8 in the position for lane n. */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
#  define PERM8_LANE(n)  index8x8( aa, sel8x8_##n(bb) )
   return mk8x8( PERM8_LANE(7), PERM8_LANE(6),
                 PERM8_LANE(5), PERM8_LANE(4),
                 PERM8_LANE(3), PERM8_LANE(2),
                 PERM8_LANE(1), PERM8_LANE(0) );
#  undef PERM8_LANE
}
990 
991 /* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  Given the semantics
   of these primops (ShlN16x4, etc), it is an error for a caller to
   pass an out-of-range shift amount in the first place.
*/
/* Shift both 32-bit lanes of xx with shl32, using the same count for
   each.  The count is nn & 31, so it is always in the range 0..31;
   the corresponding vassert(nn < 32) is not enabled. */
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   const UInt count = nn & 31;
   const UInt hi    = shl32( sel32x2_1(xx), count );
   const UInt lo    = shl32( sel32x2_0(xx), count );

   return mk32x2( hi, lo );
}
1007 
/* Shift all four 16-bit lanes of xx with shl16, using the same count
   for each.  The count is nn & 15, so it is always in the range
   0..15; the corresponding vassert(nn < 16) is not enabled. */
ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   const UInt count = nn & 15;
#  define SHL16_LANE(k)  shl16( sel16x4_##k(xx), count )
   return mk16x4( SHL16_LANE(3), SHL16_LANE(2),
                  SHL16_LANE(1), SHL16_LANE(0) );
#  undef SHL16_LANE
}
1019 
/* Shift all eight 8-bit lanes of xx with shl8, using the same count
   for each.  The count is nn & 7, so it is always in the range 0..7;
   the corresponding vassert(nn < 8) is not enabled. */
ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
{
   const UInt count = nn & 7;
#  define SHL8_LANE(k)  shl8( sel8x8_##k(xx), count )
   return mk8x8( SHL8_LANE(7), SHL8_LANE(6),
                 SHL8_LANE(5), SHL8_LANE(4),
                 SHL8_LANE(3), SHL8_LANE(2),
                 SHL8_LANE(1), SHL8_LANE(0) );
#  undef SHL8_LANE
}
1035 
/* Shift both 32-bit lanes of xx with shr32, using the same count for
   each.  The count is nn & 31, so it is always in the range 0..31;
   the corresponding vassert(nn < 32) is not enabled. */
ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   const UInt count = nn & 31;
   const UInt hi    = shr32( sel32x2_1(xx), count );
   const UInt lo    = shr32( sel32x2_0(xx), count );

   return mk32x2( hi, lo );
}
1045 
/* Shift all four 16-bit lanes of xx with shr16, using the same count
   for each.  The count is nn & 15, so it is always in the range
   0..15; the corresponding vassert(nn < 16) is not enabled. */
ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   const UInt count = nn & 15;
#  define SHR16_LANE(k)  shr16( sel16x4_##k(xx), count )
   return mk16x4( SHR16_LANE(3), SHR16_LANE(2),
                  SHR16_LANE(1), SHR16_LANE(0) );
#  undef SHR16_LANE
}
1057 
/* Shift both 32-bit lanes of xx with sar32, using the same count for
   each.  The count is nn & 31, so it is always in the range 0..31;
   the corresponding vassert(nn < 32) is not enabled. */
ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   const UInt count = nn & 31;
   const UInt hi    = sar32( sel32x2_1(xx), count );
   const UInt lo    = sar32( sel32x2_0(xx), count );

   return mk32x2( hi, lo );
}
1067 
/* Shift all four 16-bit lanes of xx with sar16, using the same count
   for each.  The count is nn & 15, so it is always in the range
   0..15; the corresponding vassert(nn < 16) is not enabled. */
ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   const UInt count = nn & 15;
#  define SAR16_LANE(k)  sar16( sel16x4_##k(xx), count )
   return mk16x4( SAR16_LANE(3), SAR16_LANE(2),
                  SAR16_LANE(1), SAR16_LANE(0) );
#  undef SAR16_LANE
}
1079 
/* Shift all eight 8-bit lanes of xx with sar8, using the same count
   for each.  The count is nn & 7, so it is always in the range 0..7;
   the corresponding vassert(nn < 8) is not enabled. */
ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   const UInt count = nn & 7;
#  define SAR8_LANE(k)  sar8( sel8x8_##k(xx), count )
   return mk8x8( SAR8_LANE(7), SAR8_LANE(6),
                 SAR8_LANE(5), SAR8_LANE(4),
                 SAR8_LANE(3), SAR8_LANE(2),
                 SAR8_LANE(1), SAR8_LANE(0) );
#  undef SAR8_LANE
}
1095 
1096 /* ------------ Averaging ------------ */
1097 
/* Lane-wise avg8U: lane n of the result is avg8U applied to lane n
   of xx and lane n of yy, for n = 7..0. */
ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
#  define AVG8U_LANE(k)  avg8U( sel8x8_##k(xx), sel8x8_##k(yy) )
   return mk8x8( AVG8U_LANE(7), AVG8U_LANE(6),
                 AVG8U_LANE(5), AVG8U_LANE(4),
                 AVG8U_LANE(3), AVG8U_LANE(2),
                 AVG8U_LANE(1), AVG8U_LANE(0) );
#  undef AVG8U_LANE
}
1111 
/* Lane-wise avg16U: lane n of the result is avg16U applied to lane n
   of xx and lane n of yy, for n = 3..0. */
ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
#  define AVG16U_LANE(k)  avg16U( sel16x4_##k(xx), sel16x4_##k(yy) )
   return mk16x4( AVG16U_LANE(3), AVG16U_LANE(2),
                  AVG16U_LANE(1), AVG16U_LANE(0) );
#  undef AVG16U_LANE
}
1121 
1122 /* ------------ max/min ------------ */
1123 
/* Lane-wise max16S: lane n of the result is max16S applied to lane n
   of xx and lane n of yy, for n = 3..0. */
ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
#  define MAX16S_LANE(k)  max16S( sel16x4_##k(xx), sel16x4_##k(yy) )
   return mk16x4( MAX16S_LANE(3), MAX16S_LANE(2),
                  MAX16S_LANE(1), MAX16S_LANE(0) );
#  undef MAX16S_LANE
}
1133 
/* Lane-wise max8U: lane n of the result is max8U applied to lane n
   of xx and lane n of yy, for n = 7..0. */
ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
#  define MAX8U_LANE(k)  max8U( sel8x8_##k(xx), sel8x8_##k(yy) )
   return mk8x8( MAX8U_LANE(7), MAX8U_LANE(6),
                 MAX8U_LANE(5), MAX8U_LANE(4),
                 MAX8U_LANE(3), MAX8U_LANE(2),
                 MAX8U_LANE(1), MAX8U_LANE(0) );
#  undef MAX8U_LANE
}
1147 
/* Lane-wise min16S: lane n of the result is min16S applied to lane n
   of xx and lane n of yy, for n = 3..0. */
ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
#  define MIN16S_LANE(k)  min16S( sel16x4_##k(xx), sel16x4_##k(yy) )
   return mk16x4( MIN16S_LANE(3), MIN16S_LANE(2),
                  MIN16S_LANE(1), MIN16S_LANE(0) );
#  undef MIN16S_LANE
}
1157 
/* Lane-wise min8U: lane n of the result is min8U applied to lane n
   of xx and lane n of yy, for n = 7..0. */
ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
#  define MIN8U_LANE(k)  min8U( sel8x8_##k(xx), sel8x8_##k(yy) )
   return mk8x8( MIN8U_LANE(7), MIN8U_LANE(6),
                 MIN8U_LANE(5), MIN8U_LANE(4),
                 MIN8U_LANE(3), MIN8U_LANE(2),
                 MIN8U_LANE(1), MIN8U_LANE(0) );
#  undef MIN8U_LANE
}
1171 
1172 /* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
1173 
1174 /* Tuple/select functions for 16x2 vectors. */
mk16x2(UShort w1,UShort w2)1175 static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
1176    return (((UInt)w1) << 16) | ((UInt)w2);
1177 }
1178 
sel16x2_1(UInt w32)1179 static inline UShort sel16x2_1 ( UInt w32 ) {
1180    return 0xFFFF & (UShort)(w32 >> 16);
1181 }
sel16x2_0(UInt w32)1182 static inline UShort sel16x2_0 ( UInt w32 ) {
1183    return 0xFFFF & (UShort)(w32);
1184 }
1185 
mk8x4(UChar w3,UChar w2,UChar w1,UChar w0)1186 static inline UInt mk8x4 ( UChar w3, UChar w2,
1187                            UChar w1, UChar w0 ) {
1188    UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
1189               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
1190    return w32;
1191 }
1192 
sel8x4_3(UInt w32)1193 static inline UChar sel8x4_3 ( UInt w32 ) {
1194    return toUChar(0xFF & (w32 >> 24));
1195 }
sel8x4_2(UInt w32)1196 static inline UChar sel8x4_2 ( UInt w32 ) {
1197    return toUChar(0xFF & (w32 >> 16));
1198 }
sel8x4_1(UInt w32)1199 static inline UChar sel8x4_1 ( UInt w32 ) {
1200    return toUChar(0xFF & (w32 >> 8));
1201 }
sel8x4_0(UInt w32)1202 static inline UChar sel8x4_0 ( UInt w32 ) {
1203    return toUChar(0xFF & (w32 >> 0));
1204 }
1205 
1206 
1207 /* ----------------------------------------------------- */
1208 /* More externally visible functions.  These simply
1209    implement the corresponding IR primops. */
1210 /* ----------------------------------------------------- */
1211 
1212 /* ------ 16x2 ------ */
1213 
/* Lane-wise add on a 16x2 vector.  Each lane's sum is narrowed to
   UShort before being repacked, so a carry out of one lane never
   reaches the other. */
UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   const UShort hi = (UShort)( sel16x2_1(xx) + sel16x2_1(yy) );
   const UShort lo = (UShort)( sel16x2_0(xx) + sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1219 
/* Lane-wise subtract (xx - yy) on a 16x2 vector.  Each lane's
   difference is narrowed to UShort before being repacked, so a
   borrow in one lane never reaches the other. */
UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   const UShort hi = (UShort)( sel16x2_1(xx) - sel16x2_1(yy) );
   const UShort lo = (UShort)( sel16x2_0(xx) - sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1225 
/* Lane-wise hadd16U on a 16x2 vector: each result lane is hadd16U
   applied to the matching lanes of xx and yy. */
UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   const UShort hi = hadd16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = hadd16U( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1231 
/* Lane-wise hadd16S on a 16x2 vector: each result lane is hadd16S
   applied to the matching lanes of xx and yy. */
UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   const UShort hi = hadd16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = hadd16S( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1237 
/* Lane-wise hsub16U on a 16x2 vector: each result lane is hsub16U
   applied to the matching lanes of xx and yy (xx first). */
UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   const UShort hi = hsub16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = hsub16U( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1243 
/* Lane-wise hsub16S on a 16x2 vector: each result lane is hsub16S
   applied to the matching lanes of xx and yy (xx first). */
UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   const UShort hi = hsub16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = hsub16S( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1249 
/* Lane-wise qadd16U on a 16x2 vector: each result lane is qadd16U
   applied to the matching lanes of xx and yy. */
UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   const UShort hi = qadd16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = qadd16U( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1255 
/* Lane-wise qadd16S on a 16x2 vector: each result lane is qadd16S
   applied to the matching lanes of xx and yy. */
UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   const UShort hi = qadd16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = qadd16S( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1261 
/* Lane-wise qsub16U on a 16x2 vector: each result lane is qsub16U
   applied to the matching lanes of xx and yy (xx first). */
UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   const UShort hi = qsub16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = qsub16U( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1267 
/* Lane-wise qsub16S on a 16x2 vector: each result lane is qsub16S
   applied to the matching lanes of xx and yy (xx first). */
UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   const UShort hi = qsub16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = qsub16S( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1273 
1274 /* ------ 8x4 ------ */
1275 
/* Lane-wise add on an 8x4 vector.  Each lane's sum is narrowed to
   UChar before being repacked, so a carry out of one lane never
   reaches its neighbour. */
UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   const UChar r3 = (UChar)( sel8x4_3(xx) + sel8x4_3(yy) );
   const UChar r2 = (UChar)( sel8x4_2(xx) + sel8x4_2(yy) );
   const UChar r1 = (UChar)( sel8x4_1(xx) + sel8x4_1(yy) );
   const UChar r0 = (UChar)( sel8x4_0(xx) + sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1285 
/* Lane-wise subtract (xx - yy) on an 8x4 vector.  Each lane's
   difference is narrowed to UChar before being repacked, so a borrow
   in one lane never reaches its neighbour. */
UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   const UChar r3 = (UChar)( sel8x4_3(xx) - sel8x4_3(yy) );
   const UChar r2 = (UChar)( sel8x4_2(xx) - sel8x4_2(yy) );
   const UChar r1 = (UChar)( sel8x4_1(xx) - sel8x4_1(yy) );
   const UChar r0 = (UChar)( sel8x4_0(xx) - sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1295 
/* Lane-wise hadd8U on an 8x4 vector: each result lane is hadd8U
   applied to the matching lanes of xx and yy. */
UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   const UChar r3 = hadd8U( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar r2 = hadd8U( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar r1 = hadd8U( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar r0 = hadd8U( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1305 
/* Lane-wise hadd8S on an 8x4 vector: each result lane is hadd8S
   applied to the matching lanes of xx and yy. */
UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   const UChar r3 = hadd8S( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar r2 = hadd8S( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar r1 = hadd8S( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar r0 = hadd8S( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1315 
/* Lane-wise hsub8U on an 8x4 vector: each result lane is hsub8U
   applied to the matching lanes of xx and yy (xx first). */
UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   const UChar r3 = hsub8U( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar r2 = hsub8U( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar r1 = hsub8U( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar r0 = hsub8U( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1325 
/* Lane-wise hsub8S on an 8x4 vector: each result lane is hsub8S
   applied to the matching lanes of xx and yy (xx first). */
UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   const UChar r3 = hsub8S( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar r2 = hsub8S( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar r1 = hsub8S( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar r0 = hsub8S( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1335 
/* Lane-wise qadd8U on an 8x4 vector: each result lane is qadd8U
   applied to the matching lanes of xx and yy. */
UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   const UChar r3 = qadd8U( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar r2 = qadd8U( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar r1 = qadd8U( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar r0 = qadd8U( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1345 
/* Lane-wise qadd8S on an 8x4 vector: each result lane is qadd8S
   applied to the matching lanes of xx and yy. */
UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   const UChar r3 = qadd8S( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar r2 = qadd8S( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar r1 = qadd8S( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar r0 = qadd8S( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1355 
/* Lane-wise qsub8U on an 8x4 vector: each result lane is qsub8U
   applied to the matching lanes of xx and yy (xx first). */
UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   const UChar r3 = qsub8U( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar r2 = qsub8U( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar r1 = qsub8U( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar r0 = qsub8U( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1365 
/* Lane-wise qsub8S on an 8x4 vector: each result lane is qsub8S
   applied to the matching lanes of xx and yy (xx first). */
UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   const UChar r3 = qsub8S( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar r2 = qsub8S( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar r1 = qsub8S( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar r0 = qsub8S( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( r3, r2, r1, r0 );
}
1375 
/* Lane-wise cmpnez16 on a 16x2 vector: each result lane is cmpnez16
   applied to the matching lane of xx. */
UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   const UShort hi = cmpnez16( sel16x2_1(xx) );
   const UShort lo = cmpnez16( sel16x2_0(xx) );

   return mk16x2( hi, lo );
}
1383 
/* Lane-wise cmpnez8 on an 8x4 vector: each result lane is cmpnez8
   applied to the matching lane of xx. */
UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   const UChar r3 = cmpnez8( sel8x4_3(xx) );
   const UChar r2 = cmpnez8( sel8x4_2(xx) );
   const UChar r1 = cmpnez8( sel8x4_1(xx) );
   const UChar r0 = cmpnez8( sel8x4_0(xx) );

   return mk8x4( r3, r2, r1, r0 );
}
1393 
/* Sum of absolute differences over an 8x4 vector: absdiff8U is
   applied to each pair of matching byte lanes of xx and yy, and the
   four results are added together into a single UInt. */
UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   const UInt d3 = absdiff8U( sel8x4_3(xx), sel8x4_3(yy) );
   const UInt d2 = absdiff8U( sel8x4_2(xx), sel8x4_2(yy) );
   const UInt d1 = absdiff8U( sel8x4_1(xx), sel8x4_1(yy) );
   const UInt d0 = absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );

   return d3 + d2 + d1 + d0;
}
1401 
/* Externally visible wrapper: forwards both operands to qadd32S and
   returns its result as a UInt. */
UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   const UInt res = qadd32S( xx, yy );
   return res;
}
1406 
/* Externally visible wrapper: forwards both operands to qsub32S
   (xx first) and returns its result as a UInt. */
UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   const UInt res = qsub32S( xx, yy );
   return res;
}
1411 
1412 
1413 /*------------------------------------------------------------------*/
1414 /* Decimal Floating Point (DFP) externally visible helper functions */
1415 /* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
1416 /*------------------------------------------------------------------*/
1417 
/* Single-bit helpers used by the conversion routines that follow.
     NOT(x)    - 1 if x is zero, otherwise 0.
     GET(x, y) - bit number y of x, delivered as 0 or 1.
     PUT(x, y) - x shifted left by y bit positions. */
#define NOT( x )    ( !( x ) )
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )
1421 
/* Expand one 10-bit densely packed BCD group into 12 bits of BCD.

   Only bits 9..0 of 'chunk' are examined; they are named p..y below
   (p is bit 9, y is bit 0).  The twelve result bits are named a..m
   (a is bit 11, m is bit 0; the letter 'l' is skipped) and each one
   is a boolean function of the input bits.  Result bits above bit 11
   are always zero. */
ULong dpb_to_bcd( ULong chunk )
{
   /* Input bits, most significant first. */
   const Short p = GET( chunk, 9 );
   const Short q = GET( chunk, 8 );
   const Short r = GET( chunk, 7 );
   const Short s = GET( chunk, 6 );
   const Short t = GET( chunk, 5 );
   const Short u = GET( chunk, 4 );
   const Short v = GET( chunk, 3 );
   const Short w = GET( chunk, 2 );
   const Short x = GET( chunk, 1 );
   const Short y = GET( chunk, 0 );

   /* Boolean equations for the BCD bits, one product term per line. */
   const Short a =   ( NOT(s) & v & w )
                   | ( t & v & w & s )
                   | ( v & w & NOT(x) );
   const Short b =   ( p & s & x & NOT(t) )
                   | ( p & NOT(w) )
                   | ( p & NOT(v) );
   const Short c =   ( q & s & x & NOT(t) )
                   | ( q & NOT(w) )
                   | ( q & NOT(v) );
   const Short d = r;
   const Short e =   ( v & NOT(w) & x )
                   | ( s & v & w & x )
                   | ( NOT(t) & v & x & w );
   const Short f =   ( p & t & v & w & x & NOT(s) )
                   | ( s & NOT(x) & v )
                   | ( s & NOT(v) );
   const Short g =   ( q & t & w & v & x & NOT(s) )
                   | ( t & NOT(x) & v )
                   | ( t & NOT(v) );
   const Short h = u;
   const Short i =   ( t & v & w & x )
                   | ( s & v & w & x )
                   | ( v & NOT(w) & NOT(x) );
   const Short j =   ( p & NOT(s) & NOT(t) & w & v )
                   | ( s & v & NOT(w) & x )
                   | ( p & w & NOT(x) & v )
                   | ( w & NOT(v) );
   const Short k =   ( q & NOT(s) & NOT(t) & v & w )
                   | ( t & v & NOT(w) & x )
                   | ( q & v & w & NOT(x) )
                   | ( x & NOT(v) );
   const Short m = y;

   /* Reassemble the twelve bits, a in bit 11 down to m in bit 0. */
   return   PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8)
          | PUT(e, 7)  | PUT(f, 6)  | PUT(g, 5) | PUT(h, 4)
          | PUT(i, 3)  | PUT(j, 2)  | PUT(k, 1) | PUT(m, 0);
}
1461 
/* Compress 12 bits of BCD (three digits) into one 10-bit densely
   packed group.

   Only bits 11..0 of 'chunk' are examined; they are named a..m below
   (a is bit 11, m is bit 0; the letter 'l' is skipped).  The ten
   result bits are named p..y (p is bit 9, y is bit 0).  The boolean
   equations for each result bit are those given in Appendix B of
   Book 1 (Power ISA User Instruction Set).  Result bits above bit 9
   are always zero. */
ULong bcd_to_dpb( ULong chunk )
{
   /* Input bits, most significant first. */
   const Short a = GET( chunk, 11 );
   const Short b = GET( chunk, 10 );
   const Short c = GET( chunk, 9 );
   const Short d = GET( chunk, 8 );
   const Short e = GET( chunk, 7 );
   const Short f = GET( chunk, 6 );
   const Short g = GET( chunk, 5 );
   const Short h = GET( chunk, 4 );
   const Short i = GET( chunk, 3 );
   const Short j = GET( chunk, 2 );
   const Short k = GET( chunk, 1 );
   const Short m = GET( chunk, 0 );

   /* Boolean equations for the packed bits, one product term per
      line. */
   const Short p =   ( f & a & i & NOT(e) )
                   | ( j & a & NOT(i) )
                   | ( b & NOT(a) );
   const Short q =   ( g & a & i & NOT(e) )
                   | ( k & a & NOT(i) )
                   | ( c & NOT(a) );
   const Short r = d;
   const Short s =   ( j & NOT(a) & e & NOT(i) )
                   | ( f & NOT(i) & NOT(e) )
                   | ( f & NOT(a) & NOT(e) )
                   | ( e & i );
   const Short t =   ( k & NOT(a) & e & NOT(i) )
                   | ( g & NOT(i) & NOT(e) )
                   | ( g & NOT(a) & NOT(e) )
                   | ( a & i );
   const Short u = h;
   const Short v = a | e | i;
   const Short w =   ( NOT(e) & j & NOT(i) )
                   | ( e & i )
                   | a;
   const Short x =   ( NOT(a) & k & NOT(i) )
                   | ( a & i )
                   | e;
   const Short y = m;

   /* Reassemble the ten bits, p in bit 9 down to y in bit 0. */
   return   PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
          | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;
}
1504 
/* Convert five 10-bit densely packed groups (bits 49..0 of 'dpb')
   into five 12-bit BCD groups (bits 59..0 of the result).  Groups
   are handled from the most significant (bits 49..40) down to the
   least significant (bits 9..0), each one being expanded with
   dpb_to_bcd and appended below the groups already converted.  Bits
   of 'dpb' above bit 49 are ignored. */
ULong h_DPBtoBCD( ULong dpb )
{
   ULong result = 0;
   Int   shift;

   for (shift = 40; shift >= 0; shift -= 10) {
      const ULong chunk = (dpb >> shift) & 0x3FF;
      result = (result << 12) | dpb_to_bcd( chunk );
   }
   return result;
}
1519 
/* Convert five 12-bit BCD groups (bits 59..0 of 'bcd') into five
   10-bit densely packed groups (bits 49..0 of the result).  Groups
   are handled from the most significant (bits 59..48) down to the
   least significant (bits 11..0), each one being compressed with
   bcd_to_dpb and appended below the groups already converted.  Bits
   of 'bcd' above bit 59 are ignored. */
ULong h_BCDtoDPB( ULong bcd )
{
   ULong result = 0;
   Int   shift;

   for (shift = 48; shift >= 0; shift -= 12) {
      const ULong chunk = (bcd >> shift) & 0xFFF;
      result = (result << 10) | bcd_to_dpb( chunk );
   }
   return result;
}
/* The single-bit helper macros are only meant for the conversion
   routines above; remove them again. */
#undef PUT
#undef GET
#undef NOT
1537 
1538 /*---------------------------------------------------------------*/
1539 /*--- end                               host_generic_simd64.c ---*/
1540 /*---------------------------------------------------------------*/
1541 
1542