
/*---------------------------------------------------------------*/
/*--- begin host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2011 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
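
/* Worked example: mk32x2(0xAABBCCDD, 0x11223344) builds the 64-bit
   value 0xAABBCCDD11223344, and sel32x2_1/sel32x2_0 recover
   0xAABBCCDD and 0x11223344 from it, so the make and select helpers
   round-trip each other. */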


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}


/* Scalar helpers. */

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}
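
/* Worked example: qadd16S(30000, 10000) computes 40000 as an Int,
   which exceeds 32767, so the result saturates to 32767 rather than
   wrapping the way a plain 16-bit add would. */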

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}
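
/* Worked example: mulhi16U(0x8000, 0x8000) forms the 32-bit product
   0x40000000 and keeps only its upper 16 bits, giving 0x4000.  The
   signed variant mulhi16S(-2, 3) forms -6 and returns -1, since
   (relying, as the rest of this file does, on arithmetic right shifts
   of signed values) the upper half of a small negative product is all
   sign bits. */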

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}
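
/* Worked example: the 16-bit pattern 0x0100 is +256, which is above
   the signed 8-bit range, so qnarrow16Sto8S(0x0100) gives 127; the
   pattern 0xFF80 is -128, which is below the unsigned 8-bit range,
   so qnarrow16Sto8U(0xFF80) clamps it to 0. */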

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}
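
/* Worked example of the logical/arithmetic distinction: shr16(0x8000, 4)
   shifts zeroes in from the left and gives 0x0800, whereas
   sar16(0x8000, 4) reinterprets the value as the Short -32768 and
   shifts copies of the sign bit in, giving 0xF800 (-2048), assuming
   the arithmetic right shift of signed values this file relies on. */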

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r = (xxi - yyi) >> 1;
   return (Char)r;
}
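
/* Worked example of the rounding difference between the averaging and
   halving helpers: avg8U(1, 2) computes (1 + 2 + 1) >> 1 = 2 (rounds
   up), while hadd8U(1, 2) computes (1 + 2) >> 1 = 1 (truncates). */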

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}
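
/* A minimal illustrative sketch (not compiled into the library) of how
   these lane-wise helpers behave: each lane is computed independently,
   so a carry out of one lane never propagates into its neighbour.  The
   input values below are arbitrary and only for demonstration. */
#if 0
static void example_Add16x4 ( void )
{
   ULong xx = mk16x4(0x0001, 0x7FFF, 0xFFFF, 0x1234);
   ULong yy = mk16x4(0x0001, 0x0001, 0x0001, 0x0001);
   ULong rr = h_generic_calc_Add16x4(xx, yy);
   /* Each 16-bit lane wraps on its own:
      rr == mk16x4(0x0002, 0x8000, 0x0000, 0x1235). */
}
#endif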

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}
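
/* Note the lane ordering of these binary narrowing helpers: aa supplies
   the high half of the result and bb the low half.  For example, with
   aa = mk32x2(100000, -100000) and bb = mk32x2(5, -5), the result is
   mk16x4(0x7FFF, 0x8000, 0x0005, 0xFFFB), i.e. 32767, -32768, 5, -5. */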

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
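
/* In other words, byte i of the Perm8x8 result is the byte of aa
   selected by the low three bits of byte i of bb.  For example, if
   every byte of bb is 0x00, every result byte is aa's lowest byte;
   if bb is mk8x8(7,6,5,4,3,2,1,0), the result is simply aa. */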

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
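
/* Worked example: h_generic_calc_Sad8Ux4(0x01020304, 0x04030201) sums
   the absolute per-byte differences |1-4| + |2-3| + |3-2| + |4-1| and
   so returns 8. */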


/*---------------------------------------------------------------*/
/*--- end host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/
