1
2 /*---------------------------------------------------------------*/
3 /*--- begin host_generic_simd64.c ---*/
4 /*---------------------------------------------------------------*/
5
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
9
10 Copyright (C) 2004-2012 OpenWorks LLP
11 info@open-works.net
12
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
27
28 The GNU General Public License is contained in the file COPYING.
29
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
34 */
35
36 /* Generic helper functions for doing 64-bit SIMD arithmetic in cases
37 where the instruction selectors cannot generate code in-line.
38 These are purely back-end entities and cannot be seen/referenced
39 from IR. */
40
41 #include "libvex_basictypes.h"
42 #include "host_generic_simd64.h"
43
44
45
46 /* Tuple/select functions for 32x2 vectors. */
47
mk32x2(UInt w1,UInt w0)48 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
49 return (((ULong)w1) << 32) | ((ULong)w0);
50 }
51
sel32x2_1(ULong w64)52 static inline UInt sel32x2_1 ( ULong w64 ) {
53 return 0xFFFFFFFF & toUInt(w64 >> 32);
54 }
sel32x2_0(ULong w64)55 static inline UInt sel32x2_0 ( ULong w64 ) {
56 return 0xFFFFFFFF & toUInt(w64);
57 }
58
59
60 /* Tuple/select functions for 16x4 vectors. gcc is pretty hopeless
61 with 64-bit shifts so we give it a hand. */
62
mk16x4(UShort w3,UShort w2,UShort w1,UShort w0)63 static inline ULong mk16x4 ( UShort w3, UShort w2,
64 UShort w1, UShort w0 ) {
65 UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
66 UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
67 return mk32x2(hi32, lo32);
68 }
69
sel16x4_3(ULong w64)70 static inline UShort sel16x4_3 ( ULong w64 ) {
71 UInt hi32 = toUInt(w64 >> 32);
72 return toUShort(0xFFFF & (hi32 >> 16));
73 }
sel16x4_2(ULong w64)74 static inline UShort sel16x4_2 ( ULong w64 ) {
75 UInt hi32 = toUInt(w64 >> 32);
76 return toUShort(0xFFFF & hi32);
77 }
sel16x4_1(ULong w64)78 static inline UShort sel16x4_1 ( ULong w64 ) {
79 UInt lo32 = (UInt)w64;
80 return toUShort(0xFFFF & (lo32 >> 16));
81 }
sel16x4_0(ULong w64)82 static inline UShort sel16x4_0 ( ULong w64 ) {
83 UInt lo32 = (UInt)w64;
84 return toUShort(0xFFFF & lo32);
85 }
86
87
88 /* Tuple/select functions for 8x8 vectors. */
89
mk8x8(UChar w7,UChar w6,UChar w5,UChar w4,UChar w3,UChar w2,UChar w1,UChar w0)90 static inline ULong mk8x8 ( UChar w7, UChar w6,
91 UChar w5, UChar w4,
92 UChar w3, UChar w2,
93 UChar w1, UChar w0 ) {
94 UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
95 | (((UInt)w5) << 8) | (((UInt)w4) << 0);
96 UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
97 | (((UInt)w1) << 8) | (((UInt)w0) << 0);
98 return mk32x2(hi32, lo32);
99 }
100
sel8x8_7(ULong w64)101 static inline UChar sel8x8_7 ( ULong w64 ) {
102 UInt hi32 = toUInt(w64 >> 32);
103 return toUChar(0xFF & (hi32 >> 24));
104 }
sel8x8_6(ULong w64)105 static inline UChar sel8x8_6 ( ULong w64 ) {
106 UInt hi32 = toUInt(w64 >> 32);
107 return toUChar(0xFF & (hi32 >> 16));
108 }
sel8x8_5(ULong w64)109 static inline UChar sel8x8_5 ( ULong w64 ) {
110 UInt hi32 = toUInt(w64 >> 32);
111 return toUChar(0xFF & (hi32 >> 8));
112 }
sel8x8_4(ULong w64)113 static inline UChar sel8x8_4 ( ULong w64 ) {
114 UInt hi32 = toUInt(w64 >> 32);
115 return toUChar(0xFF & (hi32 >> 0));
116 }
sel8x8_3(ULong w64)117 static inline UChar sel8x8_3 ( ULong w64 ) {
118 UInt lo32 = (UInt)w64;
119 return toUChar(0xFF & (lo32 >> 24));
120 }
sel8x8_2(ULong w64)121 static inline UChar sel8x8_2 ( ULong w64 ) {
122 UInt lo32 = (UInt)w64;
123 return toUChar(0xFF & (lo32 >> 16));
124 }
sel8x8_1(ULong w64)125 static inline UChar sel8x8_1 ( ULong w64 ) {
126 UInt lo32 = (UInt)w64;
127 return toUChar(0xFF & (lo32 >> 8));
128 }
sel8x8_0(ULong w64)129 static inline UChar sel8x8_0 ( ULong w64 ) {
130 UInt lo32 = (UInt)w64;
131 return toUChar(0xFF & (lo32 >> 0));
132 }
133
static inline UChar index8x8 ( ULong w64, UChar ix ) {
   /* Return the byte of w64 chosen by the low three bits of ix;
      index 0 is the least significant byte. */
   const UInt  shift = 8 * (ix & 7);
   const ULong lane  = (w64 >> shift) & 0xFF;
   return toUChar(lane);
}
138
139
140 /* Scalar helpers. */
141
qadd32S(Int xx,Int yy)142 static inline Int qadd32S ( Int xx, Int yy )
143 {
144 Long t = ((Long)xx) + ((Long)yy);
145 const Long loLim = -0x80000000LL;
146 const Long hiLim = 0x7FFFFFFFLL;
147 if (t < loLim) t = loLim;
148 if (t > hiLim) t = hiLim;
149 return (Int)t;
150 }
151
qadd16S(Short xx,Short yy)152 static inline Short qadd16S ( Short xx, Short yy )
153 {
154 Int t = ((Int)xx) + ((Int)yy);
155 if (t < -32768) t = -32768;
156 if (t > 32767) t = 32767;
157 return (Short)t;
158 }
159
qadd8S(Char xx,Char yy)160 static inline Char qadd8S ( Char xx, Char yy )
161 {
162 Int t = ((Int)xx) + ((Int)yy);
163 if (t < -128) t = -128;
164 if (t > 127) t = 127;
165 return (Char)t;
166 }
167
qadd16U(UShort xx,UShort yy)168 static inline UShort qadd16U ( UShort xx, UShort yy )
169 {
170 UInt t = ((UInt)xx) + ((UInt)yy);
171 if (t > 0xFFFF) t = 0xFFFF;
172 return (UShort)t;
173 }
174
static inline UChar qadd8U ( UChar xx, UChar yy )
{
   /* Unsigned saturating add: the sum is capped at 0xFF. */
   const UInt sum = ((UInt)xx) + ((UInt)yy);
   return (UChar)(sum > 0xFF ? 0xFF : sum);
}
181
qsub32S(Int xx,Int yy)182 static inline Int qsub32S ( Int xx, Int yy )
183 {
184 Long t = ((Long)xx) - ((Long)yy);
185 const Long loLim = -0x80000000LL;
186 const Long hiLim = 0x7FFFFFFFLL;
187 if (t < loLim) t = loLim;
188 if (t > hiLim) t = hiLim;
189 return (Int)t;
190 }
191
qsub16S(Short xx,Short yy)192 static inline Short qsub16S ( Short xx, Short yy )
193 {
194 Int t = ((Int)xx) - ((Int)yy);
195 if (t < -32768) t = -32768;
196 if (t > 32767) t = 32767;
197 return (Short)t;
198 }
199
qsub8S(Char xx,Char yy)200 static inline Char qsub8S ( Char xx, Char yy )
201 {
202 Int t = ((Int)xx) - ((Int)yy);
203 if (t < -128) t = -128;
204 if (t > 127) t = 127;
205 return (Char)t;
206 }
207
qsub16U(UShort xx,UShort yy)208 static inline UShort qsub16U ( UShort xx, UShort yy )
209 {
210 Int t = ((Int)xx) - ((Int)yy);
211 if (t < 0) t = 0;
212 if (t > 0xFFFF) t = 0xFFFF;
213 return (UShort)t;
214 }
215
static inline UChar qsub8U ( UChar xx, UChar yy )
{
   /* Unsigned saturating subtract: the signed difference is clamped
      to [0, 0xFF]. */
   const Int diff = ((Int)xx) - ((Int)yy);
   if (diff > 0xFF)
      return (UChar)0xFF;
   if (diff < 0)
      return (UChar)0;
   return (UChar)diff;
}
223
mul16(Short xx,Short yy)224 static inline Short mul16 ( Short xx, Short yy )
225 {
226 Int t = ((Int)xx) * ((Int)yy);
227 return (Short)t;
228 }
229
mul32(Int xx,Int yy)230 static inline Int mul32 ( Int xx, Int yy )
231 {
232 Int t = ((Int)xx) * ((Int)yy);
233 return (Int)t;
234 }
235
mulhi16S(Short xx,Short yy)236 static inline Short mulhi16S ( Short xx, Short yy )
237 {
238 Int t = ((Int)xx) * ((Int)yy);
239 t >>=/*s*/ 16;
240 return (Short)t;
241 }
242
mulhi16U(UShort xx,UShort yy)243 static inline UShort mulhi16U ( UShort xx, UShort yy )
244 {
245 UInt t = ((UInt)xx) * ((UInt)yy);
246 t >>=/*u*/ 16;
247 return (UShort)t;
248 }
249
static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   /* All-ones mask when the operands are equal, zero otherwise. */
   if (xx != yy)
      return 0;
   return 0xFFFFFFFF;
}
254
cmpeq16(UShort xx,UShort yy)255 static inline UShort cmpeq16 ( UShort xx, UShort yy )
256 {
257 return toUShort(xx==yy ? 0xFFFF : 0);
258 }
259
static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   /* 0xFF when the operands are equal, zero otherwise. */
   if (xx != yy)
      return toUChar(0);
   return toUChar(0xFF);
}
264
cmpgt32S(Int xx,Int yy)265 static inline UInt cmpgt32S ( Int xx, Int yy )
266 {
267 return xx>yy ? 0xFFFFFFFF : 0;
268 }
269
cmpgt16S(Short xx,Short yy)270 static inline UShort cmpgt16S ( Short xx, Short yy )
271 {
272 return toUShort(xx>yy ? 0xFFFF : 0);
273 }
274
cmpgt8S(Char xx,Char yy)275 static inline UChar cmpgt8S ( Char xx, Char yy )
276 {
277 return toUChar(xx>yy ? 0xFF : 0);
278 }
279
static inline UInt cmpnez32 ( UInt xx )
{
   /* All-ones mask for any nonzero input, zero for zero. */
   if (xx != 0)
      return 0xFFFFFFFF;
   return 0;
}
284
cmpnez16(UShort xx)285 static inline UShort cmpnez16 ( UShort xx )
286 {
287 return toUShort(xx==0 ? 0 : 0xFFFF);
288 }
289
static inline UChar cmpnez8 ( UChar xx )
{
   /* 0xFF for any nonzero input, zero for zero. */
   if (xx != 0)
      return toUChar(0xFF);
   return toUChar(0);
}
294
qnarrow32Sto16S(UInt xx0)295 static inline Short qnarrow32Sto16S ( UInt xx0 )
296 {
297 Int xx = (Int)xx0;
298 if (xx < -32768) xx = -32768;
299 if (xx > 32767) xx = 32767;
300 return (Short)xx;
301 }
302
qnarrow16Sto8S(UShort xx0)303 static inline Char qnarrow16Sto8S ( UShort xx0 )
304 {
305 Short xx = (Short)xx0;
306 if (xx < -128) xx = -128;
307 if (xx > 127) xx = 127;
308 return (Char)xx;
309 }
310
qnarrow16Sto8U(UShort xx0)311 static inline UChar qnarrow16Sto8U ( UShort xx0 )
312 {
313 Short xx = (Short)xx0;
314 if (xx < 0) xx = 0;
315 if (xx > 255) xx = 255;
316 return (UChar)xx;
317 }
318
narrow32to16(UInt xx)319 static inline UShort narrow32to16 ( UInt xx )
320 {
321 return (UShort)xx;
322 }
323
narrow16to8(UShort xx)324 static inline UChar narrow16to8 ( UShort xx )
325 {
326 return (UChar)xx;
327 }
328
329 /* shifts: we don't care about out-of-range ones, since
330 that is dealt with at a higher level. */
331
static inline UChar shl8 ( UChar v, UInt n )
{
   /* Left shift of an 8-bit lane; the shift itself happens at Int
      width and toUChar narrows the result.  n is not range-checked
      here. */
   const Int shifted = v << n;
   return toUChar(shifted);
}
336
static inline UChar sar8 ( UChar v, UInt n )
{
   /* Right shift of the lane viewed as a Char (intended as an
      arithmetic shift); n is not range-checked here. */
   const Char sv = (Char)v;
   return toUChar(sv >> n);
}
341
shl16(UShort v,UInt n)342 static inline UShort shl16 ( UShort v, UInt n )
343 {
344 return toUShort(v << n);
345 }
346
shr16(UShort v,UInt n)347 static inline UShort shr16 ( UShort v, UInt n )
348 {
349 return toUShort((((UShort)v) >> n));
350 }
351
sar16(UShort v,UInt n)352 static inline UShort sar16 ( UShort v, UInt n )
353 {
354 return toUShort(((Short)v) >> n);
355 }
356
static inline UInt shl32 ( UInt v, UInt n )
{
   /* Left shift of a 32-bit lane; n is not range-checked here. */
   v <<= n;
   return v;
}
361
static inline UInt shr32 ( UInt v, UInt n )
{
   /* Logical right shift of a 32-bit lane; n is not range-checked
      here. */
   v >>= n;
   return v;
}
366
static inline UInt sar32 ( UInt v, UInt n )
{
   /* Right shift of the lane viewed as a signed Int (intended as an
      arithmetic shift); n is not range-checked here. */
   const Int sv = (Int)v;
   return (UInt)(sv >> n);
}
371
static inline UChar avg8U ( UChar xx, UChar yy )
{
   /* Unsigned average rounded upwards: (xx + yy + 1) / 2, with the
      sum formed at UInt width. */
   const UInt sum = ((UInt)xx) + ((UInt)yy) + 1;
   return (UChar)(sum >> 1);
}
379
avg16U(UShort xx,UShort yy)380 static inline UShort avg16U ( UShort xx, UShort yy )
381 {
382 UInt xxi = (UInt)xx;
383 UInt yyi = (UInt)yy;
384 UInt r = (xxi + yyi + 1) >> 1;
385 return (UShort)r;
386 }
387
max16S(Short xx,Short yy)388 static inline Short max16S ( Short xx, Short yy )
389 {
390 return toUShort((xx > yy) ? xx : yy);
391 }
392
static inline UChar max8U ( UChar xx, UChar yy )
{
   /* Unsigned maximum of two bytes. */
   if (xx > yy)
      return toUChar(xx);
   return toUChar(yy);
}
397
min16S(Short xx,Short yy)398 static inline Short min16S ( Short xx, Short yy )
399 {
400 return toUShort((xx < yy) ? xx : yy);
401 }
402
static inline UChar min8U ( UChar xx, UChar yy )
{
   /* Unsigned minimum of two bytes. */
   if (xx < yy)
      return toUChar(xx);
   return toUChar(yy);
}
407
hadd16U(UShort xx,UShort yy)408 static inline UShort hadd16U ( UShort xx, UShort yy )
409 {
410 UInt xxi = (UInt)xx;
411 UInt yyi = (UInt)yy;
412 UInt r = (xxi + yyi) >> 1;
413 return (UShort)r;
414 }
415
hadd16S(Short xx,Short yy)416 static inline Short hadd16S ( Short xx, Short yy )
417 {
418 Int xxi = (Int)xx;
419 Int yyi = (Int)yy;
420 Int r = (xxi + yyi) >> 1;
421 return (Short)r;
422 }
423
hsub16U(UShort xx,UShort yy)424 static inline UShort hsub16U ( UShort xx, UShort yy )
425 {
426 UInt xxi = (UInt)xx;
427 UInt yyi = (UInt)yy;
428 UInt r = (xxi - yyi) >> 1;
429 return (UShort)r;
430 }
431
hsub16S(Short xx,Short yy)432 static inline Short hsub16S ( Short xx, Short yy )
433 {
434 Int xxi = (Int)xx;
435 Int yyi = (Int)yy;
436 Int r = (xxi - yyi) >> 1;
437 return (Short)r;
438 }
439
static inline UChar hadd8U ( UChar xx, UChar yy )
{
   /* Unsigned halving add: (xx + yy) >> 1, summed at UInt width so
      the carry is not lost. */
   const UInt sum = ((UInt)xx) + ((UInt)yy);
   return (UChar)(sum >> 1);
}
447
hadd8S(Char xx,Char yy)448 static inline Char hadd8S ( Char xx, Char yy )
449 {
450 Int xxi = (Int)xx;
451 Int yyi = (Int)yy;
452 Int r = (xxi + yyi) >> 1;
453 return (Char)r;
454 }
455
static inline UChar hsub8U ( UChar xx, UChar yy )
{
   /* Unsigned halving subtract: (xx - yy) >> 1 computed with
      wrapping UInt arithmetic, then truncated to 8 bits. */
   const UInt diff = ((UInt)xx) - ((UInt)yy);
   return (UChar)(diff >> 1);
}
463
hsub8S(Char xx,Char yy)464 static inline Char hsub8S ( Char xx, Char yy )
465 {
466 Int xxi = (Int)xx;
467 Int yyi = (Int)yy;
468 Int r = (xxi - yyi) >> 1;
469 return (Char)r;
470 }
471
absdiff8U(UChar xx,UChar yy)472 static inline UInt absdiff8U ( UChar xx, UChar yy )
473 {
474 UInt xxu = (UChar)xx;
475 UInt yyu = (UChar)yy;
476 return xxu >= yyu ? xxu - yyu : yyu - xxu;
477 }
478
479 /* ----------------------------------------------------- */
480 /* Start of the externally visible functions. These simply
481 implement the corresponding IR primops. */
482 /* ----------------------------------------------------- */
483
484 /* ------------ Normal addition ------------ */
485
ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   /* Lane-wise wrapping addition of two 32x2 vectors. */
   const UInt hi = sel32x2_1(xx) + sel32x2_1(yy);
   const UInt lo = sel32x2_0(xx) + sel32x2_0(yy);
   return mk32x2(hi, lo);
}
493
ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   /* Lane-wise addition of two 16x4 vectors; each sum is narrowed
      back to 16 bits via toUShort. */
   const UShort r3 = toUShort( sel16x4_3(xx) + sel16x4_3(yy) );
   const UShort r2 = toUShort( sel16x4_2(xx) + sel16x4_2(yy) );
   const UShort r1 = toUShort( sel16x4_1(xx) + sel16x4_1(yy) );
   const UShort r0 = toUShort( sel16x4_0(xx) + sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
503
ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   /* Lane-wise addition of two 8x8 vectors; each sum is narrowed
      back to 8 bits via toUChar. */
   const UChar r7 = toUChar( sel8x8_7(xx) + sel8x8_7(yy) );
   const UChar r6 = toUChar( sel8x8_6(xx) + sel8x8_6(yy) );
   const UChar r5 = toUChar( sel8x8_5(xx) + sel8x8_5(yy) );
   const UChar r4 = toUChar( sel8x8_4(xx) + sel8x8_4(yy) );
   const UChar r3 = toUChar( sel8x8_3(xx) + sel8x8_3(yy) );
   const UChar r2 = toUChar( sel8x8_2(xx) + sel8x8_2(yy) );
   const UChar r1 = toUChar( sel8x8_1(xx) + sel8x8_1(yy) );
   const UChar r0 = toUChar( sel8x8_0(xx) + sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
517
518 /* ------------ Saturating addition ------------ */
519
ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   /* Lane-wise signed saturating addition on 16x4 vectors. */
   const UShort r3 = qadd16S( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = qadd16S( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = qadd16S( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = qadd16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
529
ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   /* Lane-wise signed saturating addition on 8x8 vectors. */
   const UChar r7 = qadd8S( sel8x8_7(xx), sel8x8_7(yy) );
   const UChar r6 = qadd8S( sel8x8_6(xx), sel8x8_6(yy) );
   const UChar r5 = qadd8S( sel8x8_5(xx), sel8x8_5(yy) );
   const UChar r4 = qadd8S( sel8x8_4(xx), sel8x8_4(yy) );
   const UChar r3 = qadd8S( sel8x8_3(xx), sel8x8_3(yy) );
   const UChar r2 = qadd8S( sel8x8_2(xx), sel8x8_2(yy) );
   const UChar r1 = qadd8S( sel8x8_1(xx), sel8x8_1(yy) );
   const UChar r0 = qadd8S( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
543
ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   /* Lane-wise unsigned saturating addition on 16x4 vectors. */
   const UShort r3 = qadd16U( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = qadd16U( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = qadd16U( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = qadd16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
553
ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   /* Lane-wise unsigned saturating addition on 8x8 vectors. */
   const UChar r7 = qadd8U( sel8x8_7(xx), sel8x8_7(yy) );
   const UChar r6 = qadd8U( sel8x8_6(xx), sel8x8_6(yy) );
   const UChar r5 = qadd8U( sel8x8_5(xx), sel8x8_5(yy) );
   const UChar r4 = qadd8U( sel8x8_4(xx), sel8x8_4(yy) );
   const UChar r3 = qadd8U( sel8x8_3(xx), sel8x8_3(yy) );
   const UChar r2 = qadd8U( sel8x8_2(xx), sel8x8_2(yy) );
   const UChar r1 = qadd8U( sel8x8_1(xx), sel8x8_1(yy) );
   const UChar r0 = qadd8U( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
567
568 /* ------------ Normal subtraction ------------ */
569
ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   /* Lane-wise wrapping subtraction (xx - yy) on 32x2 vectors. */
   const UInt hi = sel32x2_1(xx) - sel32x2_1(yy);
   const UInt lo = sel32x2_0(xx) - sel32x2_0(yy);
   return mk32x2(hi, lo);
}
577
ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   /* Lane-wise subtraction (xx - yy) on 16x4 vectors; each
      difference is narrowed back to 16 bits via toUShort. */
   const UShort r3 = toUShort( sel16x4_3(xx) - sel16x4_3(yy) );
   const UShort r2 = toUShort( sel16x4_2(xx) - sel16x4_2(yy) );
   const UShort r1 = toUShort( sel16x4_1(xx) - sel16x4_1(yy) );
   const UShort r0 = toUShort( sel16x4_0(xx) - sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
587
ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   /* Lane-wise subtraction (xx - yy) on 8x8 vectors; each
      difference is narrowed back to 8 bits via toUChar. */
   const UChar r7 = toUChar( sel8x8_7(xx) - sel8x8_7(yy) );
   const UChar r6 = toUChar( sel8x8_6(xx) - sel8x8_6(yy) );
   const UChar r5 = toUChar( sel8x8_5(xx) - sel8x8_5(yy) );
   const UChar r4 = toUChar( sel8x8_4(xx) - sel8x8_4(yy) );
   const UChar r3 = toUChar( sel8x8_3(xx) - sel8x8_3(yy) );
   const UChar r2 = toUChar( sel8x8_2(xx) - sel8x8_2(yy) );
   const UChar r1 = toUChar( sel8x8_1(xx) - sel8x8_1(yy) );
   const UChar r0 = toUChar( sel8x8_0(xx) - sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
601
602 /* ------------ Saturating subtraction ------------ */
603
ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   /* Lane-wise signed saturating subtraction on 16x4 vectors. */
   const UShort r3 = qsub16S( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = qsub16S( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = qsub16S( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = qsub16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
613
ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   /* Lane-wise signed saturating subtraction on 8x8 vectors. */
   const UChar r7 = qsub8S( sel8x8_7(xx), sel8x8_7(yy) );
   const UChar r6 = qsub8S( sel8x8_6(xx), sel8x8_6(yy) );
   const UChar r5 = qsub8S( sel8x8_5(xx), sel8x8_5(yy) );
   const UChar r4 = qsub8S( sel8x8_4(xx), sel8x8_4(yy) );
   const UChar r3 = qsub8S( sel8x8_3(xx), sel8x8_3(yy) );
   const UChar r2 = qsub8S( sel8x8_2(xx), sel8x8_2(yy) );
   const UChar r1 = qsub8S( sel8x8_1(xx), sel8x8_1(yy) );
   const UChar r0 = qsub8S( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
627
ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   /* Lane-wise unsigned saturating subtraction on 16x4 vectors. */
   const UShort r3 = qsub16U( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = qsub16U( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = qsub16U( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = qsub16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
637
ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   /* Lane-wise unsigned saturating subtraction on 8x8 vectors. */
   const UChar r7 = qsub8U( sel8x8_7(xx), sel8x8_7(yy) );
   const UChar r6 = qsub8U( sel8x8_6(xx), sel8x8_6(yy) );
   const UChar r5 = qsub8U( sel8x8_5(xx), sel8x8_5(yy) );
   const UChar r4 = qsub8U( sel8x8_4(xx), sel8x8_4(yy) );
   const UChar r3 = qsub8U( sel8x8_3(xx), sel8x8_3(yy) );
   const UChar r2 = qsub8U( sel8x8_2(xx), sel8x8_2(yy) );
   const UChar r1 = qsub8U( sel8x8_1(xx), sel8x8_1(yy) );
   const UChar r0 = qsub8U( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
651
652 /* ------------ Multiplication ------------ */
653
ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   /* Lane-wise multiply on 16x4 vectors, keeping the low 16 bits of
      each product (see mul16). */
   const UShort r3 = mul16( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = mul16( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = mul16( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = mul16( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
663
ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   /* Lane-wise multiply on 32x2 vectors via mul32. */
   const UInt hi = mul32( sel32x2_1(xx), sel32x2_1(yy) );
   const UInt lo = mul32( sel32x2_0(xx), sel32x2_0(yy) );
   return mk32x2(hi, lo);
}
671
ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   /* Lane-wise signed multiply on 16x4 vectors, keeping the upper
      16 bits of each 32-bit product. */
   const UShort r3 = mulhi16S( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = mulhi16S( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = mulhi16S( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = mulhi16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
681
ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   /* Lane-wise unsigned multiply on 16x4 vectors, keeping the upper
      16 bits of each 32-bit product. */
   const UShort r3 = mulhi16U( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = mulhi16U( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = mulhi16U( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = mulhi16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
691
692 /* ------------ Comparison ------------ */
693
ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   /* Per-lane equality test on 32x2 vectors: each result lane is
      all ones if the lanes match, else zero. */
   const UInt hi = cmpeq32( sel32x2_1(xx), sel32x2_1(yy) );
   const UInt lo = cmpeq32( sel32x2_0(xx), sel32x2_0(yy) );
   return mk32x2(hi, lo);
}
701
ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   /* Per-lane equality test on 16x4 vectors: each result lane is
      all ones if the lanes match, else zero. */
   const UShort r3 = cmpeq16( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = cmpeq16( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = cmpeq16( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = cmpeq16( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
711
ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   /* Per-lane equality test on 8x8 vectors: each result lane is
      all ones if the lanes match, else zero. */
   const UChar r7 = cmpeq8( sel8x8_7(xx), sel8x8_7(yy) );
   const UChar r6 = cmpeq8( sel8x8_6(xx), sel8x8_6(yy) );
   const UChar r5 = cmpeq8( sel8x8_5(xx), sel8x8_5(yy) );
   const UChar r4 = cmpeq8( sel8x8_4(xx), sel8x8_4(yy) );
   const UChar r3 = cmpeq8( sel8x8_3(xx), sel8x8_3(yy) );
   const UChar r2 = cmpeq8( sel8x8_2(xx), sel8x8_2(yy) );
   const UChar r1 = cmpeq8( sel8x8_1(xx), sel8x8_1(yy) );
   const UChar r0 = cmpeq8( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
725
ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   /* Per-lane signed xx > yy on 32x2 vectors: all ones where true,
      zero elsewhere. */
   const UInt hi = cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) );
   const UInt lo = cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) );
   return mk32x2(hi, lo);
}
733
ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   /* Per-lane signed xx > yy on 16x4 vectors: all ones where true,
      zero elsewhere. */
   const UShort r3 = cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
743
ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   /* Per-lane xx > yy on 8x8 vectors (lanes compared as Char): all
      ones where true, zero elsewhere. */
   const UChar r7 = cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) );
   const UChar r6 = cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) );
   const UChar r5 = cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) );
   const UChar r4 = cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) );
   const UChar r3 = cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) );
   const UChar r2 = cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) );
   const UChar r1 = cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) );
   const UChar r0 = cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
757
ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   /* Per-lane "not equal to zero" on a 32x2 vector: all ones for a
      nonzero lane, zero for a zero lane. */
   const UInt hi = cmpnez32( sel32x2_1(xx) );
   const UInt lo = cmpnez32( sel32x2_0(xx) );
   return mk32x2(hi, lo);
}
765
ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   /* Per-lane "not equal to zero" on a 16x4 vector: all ones for a
      nonzero lane, zero for a zero lane. */
   const UShort r3 = cmpnez16( sel16x4_3(xx) );
   const UShort r2 = cmpnez16( sel16x4_2(xx) );
   const UShort r1 = cmpnez16( sel16x4_1(xx) );
   const UShort r0 = cmpnez16( sel16x4_0(xx) );
   return mk16x4(r3, r2, r1, r0);
}
775
ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   /* Per-lane "not equal to zero" on an 8x8 vector: all ones for a
      nonzero lane, zero for a zero lane. */
   const UChar r7 = cmpnez8( sel8x8_7(xx) );
   const UChar r6 = cmpnez8( sel8x8_6(xx) );
   const UChar r5 = cmpnez8( sel8x8_5(xx) );
   const UChar r4 = cmpnez8( sel8x8_4(xx) );
   const UChar r3 = cmpnez8( sel8x8_3(xx) );
   const UChar r2 = cmpnez8( sel8x8_2(xx) );
   const UChar r1 = cmpnez8( sel8x8_1(xx) );
   const UChar r0 = cmpnez8( sel8x8_0(xx) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
789
790 /* ------------ Saturating narrowing ------------ */
791
ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   /* Saturate each signed 32-bit lane to signed 16 bits.  The two
      lanes of aa fill the upper half of the result, those of bb the
      lower half. */
   return mk16x4(
             qnarrow32Sto16S( sel32x2_1(aa) ),
             qnarrow32Sto16S( sel32x2_0(aa) ),
             qnarrow32Sto16S( sel32x2_1(bb) ),
             qnarrow32Sto16S( sel32x2_0(bb) )
          );
}
805
ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   /* Saturate each signed 16-bit lane to signed 8 bits.  The four
      lanes of aa fill the upper half of the result, those of bb the
      lower half. */
   return mk8x8(
             qnarrow16Sto8S( sel16x4_3(aa) ),
             qnarrow16Sto8S( sel16x4_2(aa) ),
             qnarrow16Sto8S( sel16x4_1(aa) ),
             qnarrow16Sto8S( sel16x4_0(aa) ),
             qnarrow16Sto8S( sel16x4_3(bb) ),
             qnarrow16Sto8S( sel16x4_2(bb) ),
             qnarrow16Sto8S( sel16x4_1(bb) ),
             qnarrow16Sto8S( sel16x4_0(bb) )
          );
}
827
ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   /* Saturate each signed 16-bit lane to unsigned 8 bits.  The four
      lanes of aa fill the upper half of the result, those of bb the
      lower half. */
   return mk8x8(
             qnarrow16Sto8U( sel16x4_3(aa) ),
             qnarrow16Sto8U( sel16x4_2(aa) ),
             qnarrow16Sto8U( sel16x4_1(aa) ),
             qnarrow16Sto8U( sel16x4_0(aa) ),
             qnarrow16Sto8U( sel16x4_3(bb) ),
             qnarrow16Sto8U( sel16x4_2(bb) ),
             qnarrow16Sto8U( sel16x4_1(bb) ),
             qnarrow16Sto8U( sel16x4_0(bb) )
          );
}
849
850 /* ------------ Truncating narrowing ------------ */
851
ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   /* Truncate each 32-bit lane to 16 bits.  The two lanes of aa
      fill the upper half of the result, those of bb the lower
      half. */
   return mk16x4(
             narrow32to16( sel32x2_1(aa) ),
             narrow32to16( sel32x2_0(aa) ),
             narrow32to16( sel32x2_1(bb) ),
             narrow32to16( sel32x2_0(bb) )
          );
}
865
ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   /* Truncate each 16-bit lane to 8 bits.  The four lanes of aa
      fill the upper half of the result, those of bb the lower
      half. */
   return mk8x8(
             narrow16to8( sel16x4_3(aa) ),
             narrow16to8( sel16x4_2(aa) ),
             narrow16to8( sel16x4_1(aa) ),
             narrow16to8( sel16x4_0(aa) ),
             narrow16to8( sel16x4_3(bb) ),
             narrow16to8( sel16x4_2(bb) ),
             narrow16to8( sel16x4_1(bb) ),
             narrow16to8( sel16x4_0(bb) )
          );
}
887
888 /* ------------ Interleaving ------------ */
889
ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   /* Interleave the upper four bytes of aa and bb; in each pair the
      byte from aa takes the more significant position. */
   const UChar a7 = sel8x8_7(aa), b7 = sel8x8_7(bb);
   const UChar a6 = sel8x8_6(aa), b6 = sel8x8_6(bb);
   const UChar a5 = sel8x8_5(aa), b5 = sel8x8_5(bb);
   const UChar a4 = sel8x8_4(aa), b4 = sel8x8_4(bb);
   return mk8x8(a7, b7, a6, b6, a5, b5, a4, b4);
}
903
ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   /* Interleave the lower four bytes of aa and bb; in each pair the
      byte from aa takes the more significant position. */
   const UChar a3 = sel8x8_3(aa), b3 = sel8x8_3(bb);
   const UChar a2 = sel8x8_2(aa), b2 = sel8x8_2(bb);
   const UChar a1 = sel8x8_1(aa), b1 = sel8x8_1(bb);
   const UChar a0 = sel8x8_0(aa), b0 = sel8x8_0(bb);
   return mk8x8(a3, b3, a2, b2, a1, b1, a0, b0);
}
917
ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   /* Interleave the upper two 16-bit lanes of aa and bb; in each
      pair the lane from aa takes the more significant position. */
   const UShort a3 = sel16x4_3(aa), b3 = sel16x4_3(bb);
   const UShort a2 = sel16x4_2(aa), b2 = sel16x4_2(bb);
   return mk16x4(a3, b3, a2, b2);
}
927
ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   /* Interleave the lower two 16-bit lanes of aa and bb; in each
      pair the lane from aa takes the more significant position. */
   const UShort a1 = sel16x4_1(aa), b1 = sel16x4_1(bb);
   const UShort a0 = sel16x4_0(aa), b0 = sel16x4_0(bb);
   return mk16x4(a1, b1, a0, b0);
}
937
ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   /* Upper lane of aa over upper lane of bb. */
   const UInt fromA = sel32x2_1(aa);
   const UInt fromB = sel32x2_1(bb);
   return mk32x2(fromA, fromB);
}
945
ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   /* Lower lane of aa over lower lane of bb. */
   const UInt fromA = sel32x2_0(aa);
   const UInt fromB = sel32x2_0(bb);
   return mk32x2(fromA, fromB);
}
953
954 /* ------------ Concatenation ------------ */
955
ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   /* Odd-numbered 16-bit lanes (3 and 1) of aa, followed by the
      odd-numbered lanes of bb. */
   const UShort a3 = sel16x4_3(aa), a1 = sel16x4_1(aa);
   const UShort b3 = sel16x4_3(bb), b1 = sel16x4_1(bb);
   return mk16x4(a3, a1, b3, b1);
}
965
ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   /* Even-numbered 16-bit lanes (2 and 0) of aa, followed by the
      even-numbered lanes of bb. */
   const UShort a2 = sel16x4_2(aa), a0 = sel16x4_0(aa);
   const UShort b2 = sel16x4_2(bb), b0 = sel16x4_0(bb);
   return mk16x4(a2, a0, b2, b0);
}
975
976 /* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   /* Byte permute: result byte i is the byte of aa selected by the
      low three bits of byte i of bb (index8x8 does the masking). */
   const UChar r7 = index8x8(aa, sel8x8_7(bb));
   const UChar r6 = index8x8(aa, sel8x8_6(bb));
   const UChar r5 = index8x8(aa, sel8x8_5(bb));
   const UChar r4 = index8x8(aa, sel8x8_4(bb));
   const UChar r3 = index8x8(aa, sel8x8_3(bb));
   const UChar r2 = index8x8(aa, sel8x8_2(bb));
   const UChar r1 = index8x8(aa, sel8x8_1(bb));
   const UChar r0 = index8x8(aa, sel8x8_0(bb));
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
990
991 /* ------------ Shifting ------------ */
/* These primops are undefined when the shift amount equals or
   exceeds the lane width.  The shift amount is therefore masked
   here so that the scalar shifts below are always in range.  Given
   the semantics of these primops (ShlN16x4, etc), being handed an
   out-of-range shift amount is an error in the caller.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* Shift both 32-bit lanes left by nn, with nn reduced to the
      range 0..31 first. */
   /* vassert(nn < 32); */
   const UInt amt = nn & 31;
   const UInt hi  = shl32( sel32x2_1(xx), amt );
   const UInt lo  = shl32( sel32x2_0(xx), amt );
   return mk32x2(hi, lo);
}
1007
ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* Shift all four 16-bit lanes left by nn, with nn reduced to the
      range 0..15 first. */
   /* vassert(nn < 16); */
   const UInt   amt = nn & 15;
   const UShort r3  = shl16( sel16x4_3(xx), amt );
   const UShort r2  = shl16( sel16x4_2(xx), amt );
   const UShort r1  = shl16( sel16x4_1(xx), amt );
   const UShort r0  = shl16( sel16x4_0(xx), amt );
   return mk16x4(r3, r2, r1, r0);
}
1019
ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* Shift all eight byte lanes left by nn, with nn reduced to the
      range 0..7 first. */
   /* vassert(nn < 8); */
   const UInt  amt = nn & 7;
   const UChar r7  = shl8( sel8x8_7(xx), amt );
   const UChar r6  = shl8( sel8x8_6(xx), amt );
   const UChar r5  = shl8( sel8x8_5(xx), amt );
   const UChar r4  = shl8( sel8x8_4(xx), amt );
   const UChar r3  = shl8( sel8x8_3(xx), amt );
   const UChar r2  = shl8( sel8x8_2(xx), amt );
   const UChar r1  = shl8( sel8x8_1(xx), amt );
   const UChar r0  = shl8( sel8x8_0(xx), amt );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
1035
ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* Logical right shift of both 32-bit lanes by nn, with nn
      reduced to the range 0..31 first. */
   /* vassert(nn < 32); */
   const UInt amt = nn & 31;
   const UInt hi  = shr32( sel32x2_1(xx), amt );
   const UInt lo  = shr32( sel32x2_0(xx), amt );
   return mk32x2(hi, lo);
}
1045
ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* Logical right shift of all four 16-bit lanes by nn, with nn
      reduced to the range 0..15 first. */
   /* vassert(nn < 16); */
   const UInt   amt = nn & 15;
   const UShort r3  = shr16( sel16x4_3(xx), amt );
   const UShort r2  = shr16( sel16x4_2(xx), amt );
   const UShort r1  = shr16( sel16x4_1(xx), amt );
   const UShort r0  = shr16( sel16x4_0(xx), amt );
   return mk16x4(r3, r2, r1, r0);
}
1057
ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* Signed right shift (sar32) of both 32-bit lanes by nn, with
      nn reduced to the range 0..31 first. */
   /* vassert(nn < 32); */
   const UInt amt = nn & 31;
   const UInt hi  = sar32( sel32x2_1(xx), amt );
   const UInt lo  = sar32( sel32x2_0(xx), amt );
   return mk32x2(hi, lo);
}
1067
ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* Signed right shift (sar16) of all four 16-bit lanes by nn,
      with nn reduced to the range 0..15 first. */
   /* vassert(nn < 16); */
   const UInt   amt = nn & 15;
   const UShort r3  = sar16( sel16x4_3(xx), amt );
   const UShort r2  = sar16( sel16x4_2(xx), amt );
   const UShort r1  = sar16( sel16x4_1(xx), amt );
   const UShort r0  = sar16( sel16x4_0(xx), amt );
   return mk16x4(r3, r2, r1, r0);
}
1079
ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* Signed right shift (sar8) of all eight byte lanes by nn, with
      nn reduced to the range 0..7 first. */
   /* vassert(nn < 8); */
   const UInt  amt = nn & 7;
   const UChar r7  = sar8( sel8x8_7(xx), amt );
   const UChar r6  = sar8( sel8x8_6(xx), amt );
   const UChar r5  = sar8( sel8x8_5(xx), amt );
   const UChar r4  = sar8( sel8x8_4(xx), amt );
   const UChar r3  = sar8( sel8x8_3(xx), amt );
   const UChar r2  = sar8( sel8x8_2(xx), amt );
   const UChar r1  = sar8( sel8x8_1(xx), amt );
   const UChar r0  = sar8( sel8x8_0(xx), amt );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
1095
1096 /* ------------ Averaging ------------ */
1097
ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   /* Per-lane unsigned rounding average (see avg8U) on 8x8
      vectors. */
   const UChar r7 = avg8U( sel8x8_7(xx), sel8x8_7(yy) );
   const UChar r6 = avg8U( sel8x8_6(xx), sel8x8_6(yy) );
   const UChar r5 = avg8U( sel8x8_5(xx), sel8x8_5(yy) );
   const UChar r4 = avg8U( sel8x8_4(xx), sel8x8_4(yy) );
   const UChar r3 = avg8U( sel8x8_3(xx), sel8x8_3(yy) );
   const UChar r2 = avg8U( sel8x8_2(xx), sel8x8_2(yy) );
   const UChar r1 = avg8U( sel8x8_1(xx), sel8x8_1(yy) );
   const UChar r0 = avg8U( sel8x8_0(xx), sel8x8_0(yy) );
   return mk8x8(r7, r6, r5, r4, r3, r2, r1, r0);
}
1111
ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   /* Per-lane unsigned rounding average (see avg16U) on 16x4
      vectors. */
   const UShort r3 = avg16U( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort r2 = avg16U( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort r1 = avg16U( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort r0 = avg16U( sel16x4_0(xx), sel16x4_0(yy) );
   return mk16x4(r3, r2, r1, r0);
}
1121
1122 /* ------------ max/min ------------ */
1123
/* Lane-wise maximum: applies max16S to each pair of corresponding
   16-bit lanes of xx and yy and repacks the four results. */
ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   const UShort lane3 = max16S( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort lane2 = max16S( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort lane1 = max16S( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort lane0 = max16S( sel16x4_0(xx), sel16x4_0(yy) );

   return mk16x4( lane3, lane2, lane1, lane0 );
}
1133
/* Lane-wise maximum: applies max8U to each pair of corresponding
   8-bit lanes of xx and yy and repacks the eight results. */
ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   const UChar b7 = max8U( sel8x8_7(xx), sel8x8_7(yy) );
   const UChar b6 = max8U( sel8x8_6(xx), sel8x8_6(yy) );
   const UChar b5 = max8U( sel8x8_5(xx), sel8x8_5(yy) );
   const UChar b4 = max8U( sel8x8_4(xx), sel8x8_4(yy) );
   const UChar b3 = max8U( sel8x8_3(xx), sel8x8_3(yy) );
   const UChar b2 = max8U( sel8x8_2(xx), sel8x8_2(yy) );
   const UChar b1 = max8U( sel8x8_1(xx), sel8x8_1(yy) );
   const UChar b0 = max8U( sel8x8_0(xx), sel8x8_0(yy) );

   return mk8x8( b7, b6, b5, b4, b3, b2, b1, b0 );
}
1147
/* Lane-wise minimum: applies min16S to each pair of corresponding
   16-bit lanes of xx and yy and repacks the four results. */
ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   const UShort lane3 = min16S( sel16x4_3(xx), sel16x4_3(yy) );
   const UShort lane2 = min16S( sel16x4_2(xx), sel16x4_2(yy) );
   const UShort lane1 = min16S( sel16x4_1(xx), sel16x4_1(yy) );
   const UShort lane0 = min16S( sel16x4_0(xx), sel16x4_0(yy) );

   return mk16x4( lane3, lane2, lane1, lane0 );
}
1157
/* Lane-wise minimum: applies min8U to each pair of corresponding
   8-bit lanes of xx and yy and repacks the eight results. */
ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   const UChar b7 = min8U( sel8x8_7(xx), sel8x8_7(yy) );
   const UChar b6 = min8U( sel8x8_6(xx), sel8x8_6(yy) );
   const UChar b5 = min8U( sel8x8_5(xx), sel8x8_5(yy) );
   const UChar b4 = min8U( sel8x8_4(xx), sel8x8_4(yy) );
   const UChar b3 = min8U( sel8x8_3(xx), sel8x8_3(yy) );
   const UChar b2 = min8U( sel8x8_2(xx), sel8x8_2(yy) );
   const UChar b1 = min8U( sel8x8_1(xx), sel8x8_1(yy) );
   const UChar b0 = min8U( sel8x8_0(xx), sel8x8_0(yy) );

   return mk8x8( b7, b6, b5, b4, b3, b2, b1, b0 );
}
1171
1172 /* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
1173
1174 /* Tuple/select functions for 16x2 vectors. */
mk16x2(UShort w1,UShort w2)1175 static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
1176 return (((UInt)w1) << 16) | ((UInt)w2);
1177 }
1178
sel16x2_1(UInt w32)1179 static inline UShort sel16x2_1 ( UInt w32 ) {
1180 return 0xFFFF & (UShort)(w32 >> 16);
1181 }
sel16x2_0(UInt w32)1182 static inline UShort sel16x2_0 ( UInt w32 ) {
1183 return 0xFFFF & (UShort)(w32);
1184 }
1185
mk8x4(UChar w3,UChar w2,UChar w1,UChar w0)1186 static inline UInt mk8x4 ( UChar w3, UChar w2,
1187 UChar w1, UChar w0 ) {
1188 UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
1189 | (((UInt)w1) << 8) | (((UInt)w0) << 0);
1190 return w32;
1191 }
1192
sel8x4_3(UInt w32)1193 static inline UChar sel8x4_3 ( UInt w32 ) {
1194 return toUChar(0xFF & (w32 >> 24));
1195 }
sel8x4_2(UInt w32)1196 static inline UChar sel8x4_2 ( UInt w32 ) {
1197 return toUChar(0xFF & (w32 >> 16));
1198 }
sel8x4_1(UInt w32)1199 static inline UChar sel8x4_1 ( UInt w32 ) {
1200 return toUChar(0xFF & (w32 >> 8));
1201 }
sel8x4_0(UInt w32)1202 static inline UChar sel8x4_0 ( UInt w32 ) {
1203 return toUChar(0xFF & (w32 >> 0));
1204 }
1205
1206
1207 /* ----------------------------------------------------- */
1208 /* More externally visible functions. These simply
1209 implement the corresponding IR primops. */
1210 /* ----------------------------------------------------- */
1211
1212 /* ------ 16x2 ------ */
1213
/* Add corresponding 16-bit lanes of xx and yy.  Each sum is narrowed
   back to 16 bits, so a carry out of a lane is discarded. */
UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   const UShort hi = (UShort)( sel16x2_1(xx) + sel16x2_1(yy) );
   const UShort lo = (UShort)( sel16x2_0(xx) + sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1219
/* Subtract corresponding 16-bit lanes (xx minus yy).  Each difference
   is narrowed back to 16 bits, so a borrow out of a lane is discarded. */
UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   const UShort hi = (UShort)( sel16x2_1(xx) - sel16x2_1(yy) );
   const UShort lo = (UShort)( sel16x2_0(xx) - sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1225
/* Applies hadd16U to each pair of corresponding 16-bit lanes of xx
   and yy and repacks the two results. */
UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   const UShort hi = hadd16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = hadd16U( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1231
/* Applies hadd16S to each pair of corresponding 16-bit lanes of xx
   and yy and repacks the two results. */
UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   const UShort hi = hadd16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = hadd16S( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1237
/* Applies hsub16U to each pair of corresponding 16-bit lanes of xx
   and yy and repacks the two results. */
UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   const UShort hi = hsub16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = hsub16U( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1243
/* Applies hsub16S to each pair of corresponding 16-bit lanes of xx
   and yy and repacks the two results. */
UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   const UShort hi = hsub16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = hsub16S( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1249
/* Applies qadd16U to each pair of corresponding 16-bit lanes of xx
   and yy and repacks the two results. */
UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   const UShort hi = qadd16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = qadd16U( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1255
/* Applies qadd16S to each pair of corresponding 16-bit lanes of xx
   and yy and repacks the two results. */
UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   const UShort hi = qadd16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = qadd16S( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1261
/* Applies qsub16U to each pair of corresponding 16-bit lanes of xx
   and yy and repacks the two results. */
UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   const UShort hi = qsub16U( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = qsub16U( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1267
/* Applies qsub16S to each pair of corresponding 16-bit lanes of xx
   and yy and repacks the two results. */
UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   const UShort hi = qsub16S( sel16x2_1(xx), sel16x2_1(yy) );
   const UShort lo = qsub16S( sel16x2_0(xx), sel16x2_0(yy) );

   return mk16x2( hi, lo );
}
1273
1274 /* ------ 8x4 ------ */
1275
/* Add corresponding 8-bit lanes of xx and yy.  Each sum is narrowed
   back to 8 bits, so a carry out of a lane is discarded. */
UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   const UChar b3 = (UChar)( sel8x4_3(xx) + sel8x4_3(yy) );
   const UChar b2 = (UChar)( sel8x4_2(xx) + sel8x4_2(yy) );
   const UChar b1 = (UChar)( sel8x4_1(xx) + sel8x4_1(yy) );
   const UChar b0 = (UChar)( sel8x4_0(xx) + sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1285
/* Subtract corresponding 8-bit lanes (xx minus yy).  Each difference
   is narrowed back to 8 bits, so a borrow out of a lane is discarded. */
UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   const UChar b3 = (UChar)( sel8x4_3(xx) - sel8x4_3(yy) );
   const UChar b2 = (UChar)( sel8x4_2(xx) - sel8x4_2(yy) );
   const UChar b1 = (UChar)( sel8x4_1(xx) - sel8x4_1(yy) );
   const UChar b0 = (UChar)( sel8x4_0(xx) - sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1295
/* Applies hadd8U to each pair of corresponding 8-bit lanes of xx and
   yy and repacks the four results. */
UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   const UChar b3 = hadd8U( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar b2 = hadd8U( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar b1 = hadd8U( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar b0 = hadd8U( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1305
/* Applies hadd8S to each pair of corresponding 8-bit lanes of xx and
   yy and repacks the four results. */
UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   const UChar b3 = hadd8S( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar b2 = hadd8S( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar b1 = hadd8S( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar b0 = hadd8S( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1315
/* Applies hsub8U to each pair of corresponding 8-bit lanes of xx and
   yy and repacks the four results. */
UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   const UChar b3 = hsub8U( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar b2 = hsub8U( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar b1 = hsub8U( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar b0 = hsub8U( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1325
/* Applies hsub8S to each pair of corresponding 8-bit lanes of xx and
   yy and repacks the four results. */
UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   const UChar b3 = hsub8S( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar b2 = hsub8S( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar b1 = hsub8S( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar b0 = hsub8S( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1335
/* Applies qadd8U to each pair of corresponding 8-bit lanes of xx and
   yy and repacks the four results. */
UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   const UChar b3 = qadd8U( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar b2 = qadd8U( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar b1 = qadd8U( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar b0 = qadd8U( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1345
/* Applies qadd8S to each pair of corresponding 8-bit lanes of xx and
   yy and repacks the four results. */
UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   const UChar b3 = qadd8S( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar b2 = qadd8S( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar b1 = qadd8S( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar b0 = qadd8S( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1355
/* Applies qsub8U to each pair of corresponding 8-bit lanes of xx and
   yy and repacks the four results. */
UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   const UChar b3 = qsub8U( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar b2 = qsub8U( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar b1 = qsub8U( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar b0 = qsub8U( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1365
/* Applies qsub8S to each pair of corresponding 8-bit lanes of xx and
   yy and repacks the four results. */
UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   const UChar b3 = qsub8S( sel8x4_3(xx), sel8x4_3(yy) );
   const UChar b2 = qsub8S( sel8x4_2(xx), sel8x4_2(yy) );
   const UChar b1 = qsub8S( sel8x4_1(xx), sel8x4_1(yy) );
   const UChar b0 = qsub8S( sel8x4_0(xx), sel8x4_0(yy) );

   return mk8x4( b3, b2, b1, b0 );
}
1375
/* Applies cmpnez16 to each of the two 16-bit lanes of xx and repacks
   the two results. */
UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   const UShort hi = cmpnez16( sel16x2_1(xx) );
   const UShort lo = cmpnez16( sel16x2_0(xx) );

   return mk16x2( hi, lo );
}
1383
/* Applies cmpnez8 to each of the four 8-bit lanes of xx and repacks
   the four results. */
UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   const UChar b3 = cmpnez8( sel8x4_3(xx) );
   const UChar b2 = cmpnez8( sel8x4_2(xx) );
   const UChar b1 = cmpnez8( sel8x4_1(xx) );
   const UChar b0 = cmpnez8( sel8x4_0(xx) );

   return mk8x4( b3, b2, b1, b0 );
}
1393
/* Sum of the absdiff8U results for the four corresponding 8-bit lane
   pairs of xx and yy.  The total is returned as a plain UInt, not as
   a packed vector. */
UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   UInt total = 0;

   total += absdiff8U( sel8x4_3(xx), sel8x4_3(yy) );
   total += absdiff8U( sel8x4_2(xx), sel8x4_2(yy) );
   total += absdiff8U( sel8x4_1(xx), sel8x4_1(yy) );
   total += absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
   return total;
}
1401
/* Externally visible entry point that forwards both operands to the
   file-local qadd32S helper and returns its result. */
UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   const UInt res = qadd32S( xx, yy );
   return res;
}
1406
/* Externally visible entry point that forwards both operands to the
   file-local qsub32S helper and returns its result. */
UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   const UInt res = qsub32S( xx, yy );
   return res;
}
1411
1412
1413 /*------------------------------------------------------------------*/
1414 /* Decimal Floating Point (DFP) externally visible helper functions */
1415 /* that implement Iop_BCDtoDPB and Iop_DPBtoBCD */
1416 /*------------------------------------------------------------------*/
1417
/* Single-bit helpers for the DPD <-> BCD conversion routines.
     NOT(x)    : 1 when x is zero, otherwise 0.
     GET(x, y) : bit number y of x, as 0 or 1.
     PUT(x, y) : x moved up to bit position y. */
#define NOT( x )     ( !( x ) )
#define GET( x, y )  ( ( ( x ) >> ( y ) ) & 0x1UL )
#define PUT( x, y )  ( ( x ) << ( y ) )
1421
/* Convert one 10-bit densely packed chunk into 12 bits of BCD.
   Only bits 9..0 of chunk are examined.  The result carries the BCD
   bits a..m in positions 11..0; all higher bits are zero. */
ULong dpb_to_bcd( ULong chunk )
{
   /* Input bits, most significant first: chunk[9..0] = p q r s t u v w x y. */
   const Short p = GET( chunk, 9 );
   const Short q = GET( chunk, 8 );
   const Short r = GET( chunk, 7 );
   const Short s = GET( chunk, 6 );
   const Short t = GET( chunk, 5 );
   const Short u = GET( chunk, 4 );
   const Short v = GET( chunk, 3 );
   const Short w = GET( chunk, 2 );
   const Short x = GET( chunk, 1 );
   const Short y = GET( chunk, 0 );

   /* Output bits, one boolean equation each.  d, h and m are straight
      copies of r, u and y. */
   const Short a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   const Short b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   const Short c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   const Short d = r;
   const Short e = ( v & NOT(w) & x ) | ( s & v & w & x )
                   | ( NOT(t) & v & x & w );
   const Short f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v )
                   | ( s & NOT(v) );
   const Short g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v )
                   | ( t & NOT(v) );
   const Short h = u;
   const Short i = ( t & v & w & x ) | ( s & v & w & x )
                   | ( v & NOT(w) & NOT(x) );
   const Short j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
                   | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   const Short k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
                   | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   const Short m = y;

   /* Assemble the 12 output bits, a in bit 11 down to m in bit 0. */
   const ULong bcd = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8)
                     | PUT(e, 7) | PUT(f, 6) | PUT(g, 5) | PUT(h, 4)
                     | PUT(i, 3) | PUT(j, 2) | PUT(k, 1) | PUT(m, 0);
   return bcd;
}
1461
/* Convert 12 bits of BCD (three digits) into one 10-bit Densely Packed
   Decimal (DPD) chunk.  Only bits 11..0 of chunk are examined.  The
   12 input bits are named [abcdefghijkm] (a is bit 11) and the 10
   output bits [pqrstuvwxy] (p is bit 9).  According to the note that
   accompanied the original code, the boolean equations come from
   Appendix B of Book 1 of the Power ISA (User Instruction Set). */
ULong bcd_to_dpb( ULong chunk )
{
   /* Input bits, most significant first: chunk[11..0] = a b c d e f g h i j k m. */
   const Short a = GET( chunk, 11 );
   const Short b = GET( chunk, 10 );
   const Short c = GET( chunk, 9 );
   const Short d = GET( chunk, 8 );
   const Short e = GET( chunk, 7 );
   const Short f = GET( chunk, 6 );
   const Short g = GET( chunk, 5 );
   const Short h = GET( chunk, 4 );
   const Short i = GET( chunk, 3 );
   const Short j = GET( chunk, 2 );
   const Short k = GET( chunk, 1 );
   const Short m = GET( chunk, 0 );

   /* Output bits, one boolean equation each.  r, u and y are straight
      copies of d, h and m. */
   const Short p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   const Short q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   const Short r = d;
   const Short s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
                   | ( f & NOT(a) & NOT(e) ) | ( e & i );
   const Short t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
                   | ( g & NOT(a) & NOT(e) ) | ( a & i );
   const Short u = h;
   const Short v = a | e | i;
   const Short w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   const Short x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   const Short y = m;

   /* Assemble the 10 output bits, p in bit 9 down to y in bit 0. */
   const ULong dpd = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6)
                     | PUT(t, 5) | PUT(u, 4) | PUT(v, 3) | PUT(w, 2)
                     | PUT(x, 1) | y;
   return dpd;
}
1504
/* Expand the low 50 bits of dpb, taken as five 10-bit chunks, into
   five 12-bit BCD groups (60 bits) by running each chunk through
   dpb_to_bcd.  Chunk order is preserved: the chunk in bits 49..40
   ends up in result bits 59..48, and so on down to bits 9..0. */
ULong h_DPBtoBCD( ULong dpb )
{
   ULong bcd = 0;
   Int   shift;

   /* Visit the chunks from the most significant (shift 40) down to
      the least significant (shift 0). */
   for (shift = 40; shift >= 0; shift -= 10) {
      const ULong group = (dpb >> shift) & 0x3FF;
      bcd = (bcd << 12) | dpb_to_bcd( group );
   }
   return bcd;
}
1519
/* Compress the low 60 bits of bcd, taken as five 12-bit groups, into
   five 10-bit chunks (50 bits) by running each group through
   bcd_to_dpb.  Group order is preserved: the group in bits 59..48
   ends up in result bits 49..40, and so on down to bits 11..0. */
ULong h_BCDtoDPB( ULong bcd )
{
   ULong dpb = 0;
   Int   shift;

   /* Visit the groups from the most significant (shift 48) down to
      the least significant (shift 0). */
   for (shift = 48; shift >= 0; shift -= 12) {
      const ULong group = (bcd >> shift) & 0xFFF;
      dpb = (dpb << 10) | bcd_to_dpb( group );
   }
   return dpb;
}
/* The single-bit helper macros are only needed by the DPD/BCD
   routines; remove them so they do not leak any further. */
#undef PUT
#undef GET
#undef NOT
1537
1538 /*---------------------------------------------------------------*/
1539 /*--- end host_generic_simd64.c ---*/
1540 /*---------------------------------------------------------------*/
1541
1542