• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                            host_generic_simd128.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2010-2010 OpenWorks GbR
11       info@open-works.net
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 */
30 
31 /* Generic helper functions for doing 128-bit SIMD arithmetic in cases
32    where the instruction selectors cannot generate code in-line.
33    These are purely back-end entities and cannot be seen/referenced
34    from IR. */
35 
36 #include "libvex_basictypes.h"
37 #include "host_generic_simd128.h"
38 
39 
40 /* Primitive helpers always take args of the real type (signed vs
41    unsigned) but return an unsigned result, so there's no conversion
42    weirdness when stuffing results back in the V128 union fields,
43    which are all unsigned. */
44 
mul32(Int xx,Int yy)45 static inline UInt mul32 ( Int xx, Int yy )
46 {
47    Int t = ((Int)xx) * ((Int)yy);
48    return toUInt(t);
49 }
50 
max32S(Int xx,Int yy)51 static inline UInt max32S ( Int xx, Int yy )
52 {
53    return toUInt((xx > yy) ? xx : yy);
54 }
55 
min32S(Int xx,Int yy)56 static inline UInt min32S ( Int xx, Int yy )
57 {
58    return toUInt((xx < yy) ? xx : yy);
59 }
60 
/* Unsigned 32-bit maximum. */
static inline UInt max32U ( UInt xx, UInt yy )
{
   if (xx > yy)
      return toUInt(xx);
   return toUInt(yy);
}
65 
/* Unsigned 32-bit minimum. */
static inline UInt min32U ( UInt xx, UInt yy )
{
   if (xx < yy)
      return toUInt(xx);
   return toUInt(yy);
}
70 
max16U(UShort xx,UShort yy)71 static inline UShort max16U ( UShort xx, UShort yy )
72 {
73    return toUShort((xx > yy) ? xx : yy);
74 }
75 
min16U(UShort xx,UShort yy)76 static inline UShort min16U ( UShort xx, UShort yy )
77 {
78    return toUShort((xx < yy) ? xx : yy);
79 }
80 
max8S(Char xx,Char yy)81 static inline UChar max8S ( Char xx, Char yy )
82 {
83    return toUChar((xx > yy) ? xx : yy);
84 }
85 
min8S(Char xx,Char yy)86 static inline UChar min8S ( Char xx, Char yy )
87 {
88    return toUChar((xx < yy) ? xx : yy);
89 }
90 
cmpGT64S(Long xx,Long yy)91 static inline ULong cmpGT64S ( Long xx, Long yy )
92 {
93    return (((Long)xx) > ((Long)yy))
94              ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
95 }
96 
/* Arithmetic (sign-propagating) right shift of a 64-bit lane by n.
   NOTE(review): ">>" on a negative Long is implementation-defined
   in C; this relies on the compiler producing an arithmetic shift,
   as GCC/Clang do.  Caller must ensure n < 64 (see the shift-amount
   masking in h_generic_calc_SarN64x2). */
static inline ULong sar64 ( ULong v, UInt n )
{
   return ((Long)v) >> n;
}
101 
/* Arithmetic (sign-propagating) right shift of an 8-bit lane by n.
   The value is reinterpreted as signed Char so the shift replicates
   the sign bit (implementation-defined in C, but the behaviour of
   the compilers in use).  Caller must ensure n < 8 (see the masking
   in h_generic_calc_SarN8x16). */
static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}
106 
/* Lane-wise 32x4 multiply: for each of the four 32-bit lanes,
   res = low 32 bits of argL * argR. */
void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                              V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 4; i++)
      res->w32[i] = mul32(argL->w32[i], argR->w32[i]);
}
115 
/* Lane-wise signed 32x4 maximum. */
void h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 4; i++)
      res->w32[i] = max32S(argL->w32[i], argR->w32[i]);
}
124 
/* Lane-wise signed 32x4 minimum. */
void h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 4; i++)
      res->w32[i] = min32S(argL->w32[i], argR->w32[i]);
}
133 
/* Lane-wise unsigned 32x4 maximum. */
void h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 4; i++)
      res->w32[i] = max32U(argL->w32[i], argR->w32[i]);
}
142 
/* Lane-wise unsigned 32x4 minimum. */
void h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 4; i++)
      res->w32[i] = min32U(argL->w32[i], argR->w32[i]);
}
151 
/* Lane-wise unsigned 16x8 maximum. */
void h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 8; i++)
      res->w16[i] = max16U(argL->w16[i], argR->w16[i]);
}
164 
/* Lane-wise unsigned 16x8 minimum. */
void h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 8; i++)
      res->w16[i] = min16U(argL->w16[i], argR->w16[i]);
}
177 
/* Lane-wise signed 8x16 maximum. */
void h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 16; i++)
      res->w8[i] = max8S(argL->w8[i], argR->w8[i]);
}
198 
/* Lane-wise signed 8x16 minimum. */
void h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 16; i++)
      res->w8[i] = min8S(argL->w8[i], argR->w8[i]);
}
219 
/* Lane-wise signed 64x2 compare-greater-than: each result lane is
   all-ones or all-zeroes. */
void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
                                 V128* argL, V128* argR )
{
   UInt i;
   for (i = 0; i < 2; i++)
      res->w64[i] = cmpGT64S(argL->w64[i], argR->w64[i]);
}
226 
227 /* ------------ Shifting ------------ */
228 /* Note that because these primops are undefined if the shift amount
229    equals or exceeds the lane width, the shift amount is masked so
230    that the scalar shifts are always in range.  In fact, given the
231    semantics of these primops (Sar64x2, etc) it is an error if in
232    fact we are ever given an out-of-range shift amount.
233 */
/* Arithmetic right shift of both 64-bit lanes by nn.  The shift
   amount is masked to [0,63] so the scalar shift is always in
   range; per the note above, an out-of-range nn is a front-end
   error anyway.  */
void h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
{
   UInt i;
   /* vassert(nn < 64); */
   nn &= 63;
   for (i = 0; i < 2; i++)
      res->w64[i] = sar64(argL->w64[i], nn);
}
242 
/* Arithmetic right shift of all sixteen 8-bit lanes by nn.  The
   shift amount is masked to [0,7] so the scalar shift is always in
   range; per the note above, an out-of-range nn is a front-end
   error anyway. */
void h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
                              V128* argL, UInt nn)
{
   UInt i;
   /* vassert(nn < 8); */
   nn &= 7;
   for (i = 0; i < 16; i++)
      res->w8[i] = sar8(argL->w8[i], nn);
}
265 
266 /*---------------------------------------------------------------*/
267 /*--- end                              host_generic_simd128.c ---*/
268 /*---------------------------------------------------------------*/
269