
/*---------------------------------------------------------------*/
/*--- begin                              host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2010-2010 OpenWorks GbR
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.
*/

/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd128.h"


/* Primitive helpers always take args of the real type (signed vs
   unsigned) but return an unsigned result, so there's no conversion
   weirdness when stuffing results back in the V128 union fields,
   which are all unsigned. */

mul32(Int xx,Int yy)45 static inline UInt mul32 ( Int xx, Int yy )
46 {
47 Int t = ((Int)xx) * ((Int)yy);
48 return toUInt(t);
49 }
50
max32S(Int xx,Int yy)51 static inline UInt max32S ( Int xx, Int yy )
52 {
53 return toUInt((xx > yy) ? xx : yy);
54 }
55
min32S(Int xx,Int yy)56 static inline UInt min32S ( Int xx, Int yy )
57 {
58 return toUInt((xx < yy) ? xx : yy);
59 }
60
/* Unsigned 32-bit maximum. */
static inline UInt max32U ( UInt xx, UInt yy )
{
   UInt r = yy;
   if (xx > yy)
      r = xx;
   return toUInt(r);
}

/* Unsigned 32-bit minimum. */
static inline UInt min32U ( UInt xx, UInt yy )
{
   UInt r = yy;
   if (xx < yy)
      r = xx;
   return toUInt(r);
}

max16U(UShort xx,UShort yy)71 static inline UShort max16U ( UShort xx, UShort yy )
72 {
73 return toUShort((xx > yy) ? xx : yy);
74 }
75
min16U(UShort xx,UShort yy)76 static inline UShort min16U ( UShort xx, UShort yy )
77 {
78 return toUShort((xx < yy) ? xx : yy);
79 }
80
max8S(Char xx,Char yy)81 static inline UChar max8S ( Char xx, Char yy )
82 {
83 return toUChar((xx > yy) ? xx : yy);
84 }
85
min8S(Char xx,Char yy)86 static inline UChar min8S ( Char xx, Char yy )
87 {
88 return toUChar((xx < yy) ? xx : yy);
89 }
90
cmpGT64S(Long xx,Long yy)91 static inline ULong cmpGT64S ( Long xx, Long yy )
92 {
93 return (((Long)xx) > ((Long)yy))
94 ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
95 }
96
/* 64-bit arithmetic (sign-extending) shift right.  NOTE(review):
   right-shifting a negative signed value is implementation-defined
   in C; this relies on the compiler producing an arithmetic shift,
   as all compilers targeted by VEX do. */
static inline ULong sar64 ( ULong v, UInt n )
{
   Long s = (Long)v;
   return (ULong)(s >> n);
}

/* 8-bit arithmetic shift right: reinterpret as signed, shift (after
   integer promotion), and truncate back to 8 bits. */
static inline UChar sar8 ( UChar v, UInt n )
{
   Char s = (Char)v;
   return toUChar(s >> n);
}

/* Lane-wise 32x4 multiply: each result lane is the low 32 bits of
   the product of the corresponding argL/argR lanes. */
void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                              V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 4; i++)
      res->w32[i] = mul32(argL->w32[i], argR->w32[i]);
}

/* Lane-wise signed 32x4 maximum. */
void h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 4; i++)
      res->w32[i] = max32S(argL->w32[i], argR->w32[i]);
}

/* Lane-wise signed 32x4 minimum. */
void h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 4; i++)
      res->w32[i] = min32S(argL->w32[i], argR->w32[i]);
}

/* Lane-wise unsigned 32x4 maximum. */
void h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 4; i++)
      res->w32[i] = max32U(argL->w32[i], argR->w32[i]);
}

/* Lane-wise unsigned 32x4 minimum. */
void h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 4; i++)
      res->w32[i] = min32U(argL->w32[i], argR->w32[i]);
}

/* Lane-wise unsigned 16x8 maximum. */
void h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 8; i++)
      res->w16[i] = max16U(argL->w16[i], argR->w16[i]);
}

/* Lane-wise unsigned 16x8 minimum. */
void h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 8; i++)
      res->w16[i] = min16U(argL->w16[i], argR->w16[i]);
}

/* Lane-wise signed 8x16 maximum. */
void h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 16; i++)
      res->w8[i] = max8S(argL->w8[i], argR->w8[i]);
}

/* Lane-wise signed 8x16 minimum. */
void h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 16; i++)
      res->w8[i] = min8S(argL->w8[i], argR->w8[i]);
}

/* Lane-wise signed 64x2 compare-greater-than: each result lane is
   all-ones if argL > argR in that lane, else all-zeroes. */
void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
                                 V128* argL, V128* argR )
{
   Int i;
   for (i = 0; i < 2; i++)
      res->w64[i] = cmpGT64S(argL->w64[i], argR->w64[i]);
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (Sar64x2, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
/* Arithmetic-shift-right each 64-bit lane of argL by nn bits.
   nn is masked to 0..63 so the scalar shift is always defined,
   though an out-of-range nn is an error at the IR level. */
void h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
{
   /* vassert(nn < 64); */
   Int i;
   nn &= 63;
   for (i = 0; i < 2; i++)
      res->w64[i] = sar64(argL->w64[i], nn);
}

/* Arithmetic-shift-right each 8-bit lane of argL by nn bits.
   nn is masked to 0..7 so the scalar shift is always defined,
   though an out-of-range nn is an error at the IR level. */
void h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
{
   /* vassert(nn < 8); */
   Int i;
   nn &= 7;
   for (i = 0; i < 16; i++)
      res->w8[i] = sar8(argL->w8[i], nn);
}

/*---------------------------------------------------------------*/
/*--- end                                host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/