1
2 /*---------------------------------------------------------------*/
3 /*--- begin guest_amd64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
5
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
9
10 Copyright (C) 2004-2012 OpenWorks LLP
11 info@open-works.net
12
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
27
28 The GNU General Public License is contained in the file COPYING.
29
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
34 */
35
36 #include "libvex_basictypes.h"
37 #include "libvex_emwarn.h"
38 #include "libvex_guest_amd64.h"
39 #include "libvex_ir.h"
40 #include "libvex.h"
41
42 #include "main_util.h"
43 #include "guest_generic_bb_to_IR.h"
44 #include "guest_amd64_defs.h"
45 #include "guest_generic_x87.h"
46
47
48 /* This file contains helper functions for amd64 guest code.
49 Calls to these functions are generated by the back end.
50 These calls are of course in the host machine code and
51 this file will be compiled to host machine code, so that
52 all makes sense.
53
54 Only change the signatures of these helper functions very
55 carefully. If you change the signature here, you'll have to change
56 the parameters passed to it in the IR calls constructed by
57 guest-amd64/toIR.c.
58
59 The convention used is that all functions called from generated
60 code are named amd64g_<something>, and any function whose name lacks
61 that prefix is not called from generated code. Note that some
62 LibVEX_* functions can however be called by VEX's client, but that
63 is not the same as calling them from VEX-generated code.
64 */
65
66
67 /* Set to 1 to get detailed profiling info about use of the flag
68 machinery. */
69 #define PROFILE_RFLAGS 0
70
71
72 /*---------------------------------------------------------------*/
73 /*--- %rflags run-time helpers. ---*/
74 /*---------------------------------------------------------------*/
75
76 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
77 after imulq/mulq. */
78
/* Signed 64 x 64 -> 128 bit multiply.  The high 64 bits of the
   product are stored in *rHi and the low 64 bits in *rLo.

   The high half is assembled from 32 x 32 partial products: u0/v0
   are the zero-extended low halves of the operands and u1/v1 the
   signed high halves (this assumes >> on a negative Long is an
   arithmetic shift, as it is with the compilers VEX is built with). */
static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   ULong u0, v0, w0;
   Long  u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   /* Form the low half with unsigned operands: an unsigned multiply
      wraps modulo 2^64, whereas a signed u * v that overflows is
      undefined behaviour. */
   *rLo = (Long)((ULong)u * (ULong)v);
}
95
/* Unsigned 64 x 64 -> 128 bit multiply.  The high 64 bits of the
   product are stored in *rHi and the low 64 bits in *rLo.  The high
   half is accumulated from the four 32 x 32 partial products. */
static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   const ULong lo32 = 0xFFFFFFFFULL;
   const ULong uLo  = u & lo32;
   const ULong uHi  = u >> 32;
   const ULong vLo  = v & lo32;
   const ULong vHi  = v >> 32;

   /* low x low, then the two cross terms with carries folded in */
   const ULong lowProd = uLo * vLo;
   const ULong cross1  = uHi * vLo + (lowProd >> 32);
   const ULong cross2  = uLo * vHi + (cross1 & lo32);

   *rHi = uHi * vHi + (cross1 >> 32) + (cross2 >> 32);
   *rLo = u * v;
}
112
113
114 static const UChar parity_table[256] = {
115 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
116 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
117 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
118 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
119 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
120 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
121 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
123 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
124 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
125 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
126 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
127 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
128 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
129 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
131 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
132 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
133 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
135 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
136 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
137 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
139 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
140 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
141 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
142 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
143 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
144 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
145 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
147 };
148
149 /* generalised left-shifter */
lshift(Long x,Int n)150 static inline Long lshift ( Long x, Int n )
151 {
152 if (n >= 0)
153 return x << n;
154 else
155 return x >> (-n);
156 }
157
158 /* identity on ULong */
/* Identity function on ULong.  Passed as the "narrowing" argument to
   the multiply macros where the wide type is already 64 bits and no
   narrowing is needed. */
static inline ULong idULong ( ULong x ) { return x; }
163
164
/* Common prologue for the ACTIONS_* macros.  Declares DATA_MASK (all
   ones across the operand width), SIGN_MASK (the operand's top bit)
   and local copies CC_DEP1/CC_DEP2/CC_NDEP of the enclosing
   function's cc_*_formal parameters.  Not every ACTIONS_* body uses
   all of them, so each optional one is referenced once to keep the
   compiler from warning about unused variables. */
#define PREAMBLE(PBITS)                                         \
   /* const */ ULong DATA_MASK                                  \
      = ((PBITS) == 8)  ? 0xFFULL                               \
      : ((PBITS) == 16) ? 0xFFFFULL                             \
      : ((PBITS) == 32) ? 0xFFFFFFFFULL                         \
      :                   0xFFFFFFFFFFFFFFFFULL;                \
   /* const */ ULong SIGN_MASK = 1ULL << ((PBITS) - 1);         \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;                  \
   /* const */ ULong CC_DEP2 = cc_dep2_formal;                  \
   /* const */ ULong CC_NDEP = cc_ndep_formal;                  \
   (void)SIGN_MASK;                                             \
   (void)DATA_MASK;                                             \
   (void)CC_DEP2;                                               \
   (void)CC_NDEP;
185
186
187 /*-------------------------------------------------------------*/
188
/* Flags after an ADD of width DATA_BITS.  CC_DEP1 and CC_DEP2 are
   the two operands.  All arithmetic is done on ULong so that a
   64-bit sum which wraps is well defined (signed overflow is not). */
#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL + argR;                                        \
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                   \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift((Long)res, 8 - DATA_BITS) & 0x80;              \
     of = lshift((Long)(~(argL ^ argR) & (argL ^ res)),         \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
207
208 /*-------------------------------------------------------------*/
209
/* Flags after a SUB/CMP of width DATA_BITS computing
   CC_DEP1 - CC_DEP2.  All arithmetic is done on ULong so that a
   64-bit difference which wraps is well defined (signed overflow is
   not). */
#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL - argR;                                        \
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;                  \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift((Long)res, 8 - DATA_BITS) & 0x80;              \
     of = lshift((Long)((argL ^ argR) & (argL ^ res)),          \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
228
229 /*-------------------------------------------------------------*/
230
/* Flags after an ADC of width DATA_BITS.  CC_DEP1 is the left
   operand, CC_DEP2 holds the right operand XORed with the old carry,
   and the old carry itself is taken from CC_NDEP.  All arithmetic is
   done on ULong so that a 64-bit sum which wraps is well defined
   (signed overflow is not). */
#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL + argR) + oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;               \
     else                                                       \
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift((Long)res, 8 - DATA_BITS) & 0x80;              \
     of = lshift((Long)(~(argL ^ argR) & (argL ^ res)),         \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
253
254 /*-------------------------------------------------------------*/
255
/* Flags after an SBB of width DATA_BITS.  CC_DEP1 is the left
   operand, CC_DEP2 holds the right operand XORed with the old carry,
   and the old carry itself is taken from CC_NDEP.  All arithmetic is
   done on ULong so that a 64-bit difference which wraps is well
   defined (signed overflow is not). */
#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL - argR) - oldC;                               \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;              \
     else                                                       \
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;               \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift((Long)res, 8 - DATA_BITS) & 0x80;              \
     of = lshift((Long)((argL ^ argR) & (argL ^ res)),          \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
278
279 /*-------------------------------------------------------------*/
280
/* Flags after a logical operation of width DATA_BITS whose result is
   CC_DEP1.  CF, AF and OF are zero; PF, ZF and SF are derived from
   the result. */
#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)                     \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { const Long parity = parity_table[(UChar)CC_DEP1];          \
     const Long zero   = ((DATA_UTYPE)CC_DEP1 == 0) << 6;       \
     const Long sign   = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     return parity | zero | sign;                               \
   }                                                            \
}
294
295 /*-------------------------------------------------------------*/
296
/* Flags after an INC of width DATA_BITS.  CC_DEP1 is the result, and
   the carry flag is passed through unchanged from CC_NDEP.  The
   pre-increment value is rebuilt as res - 1 on ULong, so that the
   subtraction is well defined even when res is the most negative
   64-bit value (signed overflow is not). */
#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res - 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift((Long)res, 8 - DATA_BITS) & 0x80;              \
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;               \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
314
315 /*-------------------------------------------------------------*/
316
/* Flags after a DEC of width DATA_BITS.  CC_DEP1 is the result, and
   the carry flag is passed through unchanged from CC_NDEP.  The
   pre-decrement value is rebuilt as res + 1 on ULong, so that the
   addition is well defined even when res is the most positive
   64-bit value (signed overflow is not). */
#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res + 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift((Long)res, 8 - DATA_BITS) & 0x80;              \
     of = ((res & DATA_MASK)                                    \
           == ((ULong)SIGN_MASK - 1)) << 11;                    \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
335
336 /*-------------------------------------------------------------*/
337
/* Flags after a left shift of width DATA_BITS.  PF, ZF and SF are
   derived from CC_DEP1; CF is the top operand-width bit of CC_DEP2;
   OF is the top bit of CC_DEP2 ^ CC_DEP1 (architecturally defined
   only for a shift count of 1).  AF is undefined and left as zero. */
#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { const Long carry                                           \
        = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;      \
     const Long parity = parity_table[(UChar)CC_DEP1];          \
     const Long zero   = ((DATA_UTYPE)CC_DEP1 == 0) << 6;       \
     const Long sign   = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     const Long ovf                                             \
        = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return carry | parity | zero | sign | ovf;                 \
   }                                                            \
}
353
354 /*-------------------------------------------------------------*/
355
/* Flags after a right shift of width DATA_BITS.  PF, ZF and SF are
   derived from CC_DEP1; CF is bit 0 of CC_DEP2; OF is the top bit of
   CC_DEP2 ^ CC_DEP1 (architecturally defined only for a shift count
   of 1).  AF is undefined and left as zero. */
#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { const Long carry  = CC_DEP2 & 1;                           \
     const Long parity = parity_table[(UChar)CC_DEP1];          \
     const Long zero   = ((DATA_UTYPE)CC_DEP1 == 0) << 6;       \
     const Long sign   = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
     const Long ovf                                             \
        = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)             \
          & AMD64G_CC_MASK_O;                                   \
     return carry | parity | zero | sign | ovf;                 \
   }                                                            \
}
371
372 /*-------------------------------------------------------------*/
373
374 /* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). */
375 /* DEP1 = result, NDEP = old flags */
/* Flags after a rotate-left of width DATA_BITS.  Everything except C
   and O is carried over from CC_NDEP.  The new C is bit 0 of the
   result; the new O is (top bit of result) ^ (bit 0 of result), each
   moved to bit position 11 before being combined. */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { const ULong kept                                           \
        = CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C);     \
     const ULong newC = AMD64G_CC_MASK_C & CC_DEP1;             \
     const ULong newO                                           \
        = AMD64G_CC_MASK_O                                      \
          & (lshift(CC_DEP1, 12 - DATA_BITS)                    \
             ^ lshift(CC_DEP1, 11));                            \
     return kept | newC | newO;                                 \
   }                                                            \
}
388
389 /*-------------------------------------------------------------*/
390
391 /* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */
392 /* DEP1 = result, NDEP = old flags */
/* Flags after a rotate-right of width DATA_BITS.  Everything except
   C and O is carried over from CC_NDEP.  The new C is the top bit of
   the result; the new O is (top bit) ^ (next-to-top bit), each moved
   to bit position 11 before being combined. */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { const ULong kept                                           \
        = CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C);     \
     const ULong newC                                           \
        = AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1));        \
     const ULong newO                                           \
        = AMD64G_CC_MASK_O                                      \
          & (lshift(CC_DEP1, 12 - DATA_BITS)                    \
             ^ lshift(CC_DEP1, 13 - DATA_BITS));                \
     return kept | newC | newO;                                 \
   }                                                            \
}
405
406 /*-------------------------------------------------------------*/
407
/* Flags after an unsigned widening multiply of two DATA_BITS-wide
   operands (CC_DEP1, CC_DEP2).  rr is the double-width product; hi
   and lo are its upper and lower halves.  CF and OF are set when hi
   is non-zero.

   lo is taken from rr rather than from a separate narrow multiply:
   for 16-bit operands the narrow form multiplies two values promoted
   to (signed) int, and 0xFFFF * 0xFFFF overflows int, which is
   undefined behaviour. */
#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo;                                            \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     lo = (DATA_UTYPE)rr;                                       \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
431
432 /*-------------------------------------------------------------*/
433
/* Flags after a signed widening multiply of two DATA_BITS-wide
   operands (CC_DEP1, CC_DEP2).  rr is the double-width product; hi
   and lo are its upper and lower halves.  CF and OF are set when hi
   is not simply the sign extension of lo.

   lo is taken from rr rather than from a separate narrow multiply:
   for 32-bit operands the narrow form is an Int * Int multiply that
   can overflow, which is undefined behaviour.  The double-width
   product cannot overflow. */
#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo;                                            \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     lo = (DATA_STYPE)rr;                                       \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
457
458 /*-------------------------------------------------------------*/
459
/* Flags after an unsigned 64 x 64 -> 128 multiply of CC_DEP1 and
   CC_DEP2, using mullU64 for the product.  CF and OF are set when
   the high 64 bits are non-zero.  AF is undefined and left as
   zero. */
#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong prodHi, prodLo;                                      \
     Long  carry, parity, zero, sign;                           \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2,                   \
              &prodHi, &prodLo );                               \
     carry  = (prodHi != 0);                                    \
     parity = parity_table[(UChar)prodLo];                      \
     zero   = (prodLo == 0) << 6;                               \
     sign   = lshift(prodLo, 8 - 64) & 0x80;                    \
     return carry | parity | zero | sign | (carry << 11);       \
   }                                                            \
}
475
476 /*-------------------------------------------------------------*/
477
/* Flags after a signed 64 x 64 -> 128 multiply of CC_DEP1 and
   CC_DEP2, using mullS64 for the product.  CF and OF are set when
   the high 64 bits are not simply the sign extension of the low 64
   bits.  AF is undefined and left as zero. */
#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long prodHi, prodLo;                                       \
     Long carry, parity, zero, sign;                            \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2,                     \
              &prodHi, &prodLo );                               \
     carry  = (prodHi != (prodLo >>/*s*/ (64-1)));              \
     parity = parity_table[(UChar)prodLo];                      \
     zero   = (prodLo == 0) << 6;                               \
     sign   = lshift(prodLo, 8 - 64) & 0x80;                    \
     return carry | parity | zero | sign | (carry << 11);       \
   }                                                            \
}
493
494
495 #if PROFILE_RFLAGS
496
497 static Bool initted = False;
498
499 /* C flag, fast route */
500 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
501 /* C flag, slow route */
502 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
503 /* table for calculate_cond */
504 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
505 /* total entry counts for calc_all, calc_c, calc_cond. */
506 static UInt n_calc_all = 0;
507 static UInt n_calc_c = 0;
508 static UInt n_calc_cond = 0;
509
510 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
511
512
showCounts(void)513 static void showCounts ( void )
514 {
515 Int op, co;
516 Char ch;
517 vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n",
518 n_calc_all, n_calc_cond, n_calc_c);
519
520 vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE"
521 " S NS P NP L NL LE NLE\n");
522 vex_printf(" -----------------------------------------------------"
523 "----------------------------------------\n");
524 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
525
526 ch = ' ';
527 if (op > 0 && (op-1) % 4 == 0)
528 ch = 'B';
529 if (op > 0 && (op-1) % 4 == 1)
530 ch = 'W';
531 if (op > 0 && (op-1) % 4 == 2)
532 ch = 'L';
533 if (op > 0 && (op-1) % 4 == 3)
534 ch = 'Q';
535
536 vex_printf("%2d%c: ", op, ch);
537 vex_printf("%6u ", tabc_slow[op]);
538 vex_printf("%6u ", tabc_fast[op]);
539 for (co = 0; co < 16; co++) {
540 Int n = tab_cond[op][co];
541 if (n >= 1000) {
542 vex_printf(" %3dK", n / 1000);
543 } else
544 if (n >= 0) {
545 vex_printf(" %3d ", n );
546 } else {
547 vex_printf(" ");
548 }
549 }
550 vex_printf("\n");
551 }
552 vex_printf("\n");
553 }
554
initCounts(void)555 static void initCounts ( void )
556 {
557 Int op, co;
558 initted = True;
559 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
560 tabc_fast[op] = tabc_slow[op] = 0;
561 for (co = 0; co < 16; co++)
562 tab_cond[op][co] = 0;
563 }
564 }
565
566 #endif /* PROFILE_RFLAGS */
567
568
/* NOT CALLED DIRECTLY FROM GENERATED CODE */
/* Calculate all the 6 flags from the supplied thunk parameters.
   Worker function shared by the helpers and LibVEX_* functions
   below. */
572 static
amd64g_calculate_rflags_all_WRK(ULong cc_op,ULong cc_dep1_formal,ULong cc_dep2_formal,ULong cc_ndep_formal)573 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
574 ULong cc_dep1_formal,
575 ULong cc_dep2_formal,
576 ULong cc_ndep_formal )
577 {
578 switch (cc_op) {
579 case AMD64G_CC_OP_COPY:
580 return cc_dep1_formal
581 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
582 | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
583
584 case AMD64G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar );
585 case AMD64G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort );
586 case AMD64G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt );
587 case AMD64G_CC_OP_ADDQ: ACTIONS_ADD( 64, ULong );
588
589 case AMD64G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar );
590 case AMD64G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort );
591 case AMD64G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt );
592 case AMD64G_CC_OP_ADCQ: ACTIONS_ADC( 64, ULong );
593
594 case AMD64G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar );
595 case AMD64G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort );
596 case AMD64G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt );
597 case AMD64G_CC_OP_SUBQ: ACTIONS_SUB( 64, ULong );
598
599 case AMD64G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar );
600 case AMD64G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort );
601 case AMD64G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt );
602 case AMD64G_CC_OP_SBBQ: ACTIONS_SBB( 64, ULong );
603
604 case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar );
605 case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
606 case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt );
607 case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong );
608
609 case AMD64G_CC_OP_INCB: ACTIONS_INC( 8, UChar );
610 case AMD64G_CC_OP_INCW: ACTIONS_INC( 16, UShort );
611 case AMD64G_CC_OP_INCL: ACTIONS_INC( 32, UInt );
612 case AMD64G_CC_OP_INCQ: ACTIONS_INC( 64, ULong );
613
614 case AMD64G_CC_OP_DECB: ACTIONS_DEC( 8, UChar );
615 case AMD64G_CC_OP_DECW: ACTIONS_DEC( 16, UShort );
616 case AMD64G_CC_OP_DECL: ACTIONS_DEC( 32, UInt );
617 case AMD64G_CC_OP_DECQ: ACTIONS_DEC( 64, ULong );
618
619 case AMD64G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar );
620 case AMD64G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort );
621 case AMD64G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt );
622 case AMD64G_CC_OP_SHLQ: ACTIONS_SHL( 64, ULong );
623
624 case AMD64G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar );
625 case AMD64G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort );
626 case AMD64G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt );
627 case AMD64G_CC_OP_SHRQ: ACTIONS_SHR( 64, ULong );
628
629 case AMD64G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar );
630 case AMD64G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort );
631 case AMD64G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt );
632 case AMD64G_CC_OP_ROLQ: ACTIONS_ROL( 64, ULong );
633
634 case AMD64G_CC_OP_RORB: ACTIONS_ROR( 8, UChar );
635 case AMD64G_CC_OP_RORW: ACTIONS_ROR( 16, UShort );
636 case AMD64G_CC_OP_RORL: ACTIONS_ROR( 32, UInt );
637 case AMD64G_CC_OP_RORQ: ACTIONS_ROR( 64, ULong );
638
639 case AMD64G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, toUChar,
640 UShort, toUShort );
641 case AMD64G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, toUShort,
642 UInt, toUInt );
643 case AMD64G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, toUInt,
644 ULong, idULong );
645
646 case AMD64G_CC_OP_UMULQ: ACTIONS_UMULQ;
647
648 case AMD64G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, toUChar,
649 Short, toUShort );
650 case AMD64G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, toUShort,
651 Int, toUInt );
652 case AMD64G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, toUInt,
653 Long, idULong );
654
655 case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ;
656
657 default:
658 /* shouldn't really make these calls from generated code */
659 vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
660 "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
661 cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
662 vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
663 }
664 }
665
666
667 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
668 /* Calculate all the 6 flags from the supplied thunk parameters. */
/* Entry point wrapping amd64g_calculate_rflags_all_WRK: returns the
   worker's result for the given thunk unchanged.  When
   PROFILE_RFLAGS is enabled it also bumps the calc_all counter and
   periodically prints the profiling tables. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
                                    ULong cc_dep1,
                                    ULong cc_dep2,
                                    ULong cc_ndep )
{
   ULong all_flags;

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   all_flags
      = amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
   return all_flags;
}
682
683
684 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
685 /* Calculate just the carry flag from the supplied thunk parameters. */
/* Return just the carry flag for the given thunk.  COPY and the
   LOGIC ops are answered directly; every other cc_op goes through
   the full worker and has its result masked with AMD64G_CC_MASK_C.
   When PROFILE_RFLAGS is enabled, each call is first counted as
   "fast" and re-counted as "slow" if it falls through to the
   worker. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
                                  ULong cc_dep1,
                                  ULong cc_dep2,
                                  ULong cc_ndep )
{
#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast case: the flags are stored verbatim in dep1. */
   if (cc_op == AMD64G_CC_OP_COPY)
      return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;

   /* Fast case: logical operations always clear the carry. */
   if (cc_op == AMD64G_CC_OP_LOGICQ || cc_op == AMD64G_CC_OP_LOGICL
       || cc_op == AMD64G_CC_OP_LOGICW || cc_op == AMD64G_CC_OP_LOGICB)
      return 0;

   /* Further candidate fast cases (SUBL, SUBW, SUBB, INCL, DECL)
      previously existed here only as disabled code, and are not
      enabled. */

#  if PROFILE_RFLAGS
   tabc_fast[cc_op]--;
   tabc_slow[cc_op]++;
#  endif

   return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
          & AMD64G_CC_MASK_C;
}
731
732
733 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
734 /* returns 1 or 0 */
/* Evaluate condition code 'cond' against the flags described by the
   thunk; returns 1 or 0.  Condition codes come in pairs that differ
   only in bit 0, which selects the inverted sense, so each pair
   shares a case: the case computes the flag expression in bit 0 of
   'test', and the final return XORs in the inversion bit.  An
   unrecognised cond is reported with vex_printf and passed to
   vpanic. */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
                                   ULong cc_op,
                                   ULong cc_dep1,
                                   ULong cc_dep2,
                                   ULong cc_ndep )
{
   ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
                                                  cc_dep2, cc_ndep);
   ULong inv  = cond & 1;
   ULong test = 0;

#  if PROFILE_RFLAGS
   if (!initted) initCounts();
   tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case AMD64CondNO:
      case AMD64CondO:   /* OF == 1 */
         test = rflags >> AMD64G_CC_SHIFT_O;
         break;

      case AMD64CondNZ:
      case AMD64CondZ:   /* ZF == 1 */
         test = rflags >> AMD64G_CC_SHIFT_Z;
         break;

      case AMD64CondNB:
      case AMD64CondB:   /* CF == 1 */
         test = rflags >> AMD64G_CC_SHIFT_C;
         break;

      case AMD64CondNBE:
      case AMD64CondBE:  /* (CF or ZF) == 1 */
         test = (rflags >> AMD64G_CC_SHIFT_C)
                | (rflags >> AMD64G_CC_SHIFT_Z);
         break;

      case AMD64CondNS:
      case AMD64CondS:   /* SF == 1 */
         test = rflags >> AMD64G_CC_SHIFT_S;
         break;

      case AMD64CondNP:
      case AMD64CondP:   /* PF == 1 */
         test = rflags >> AMD64G_CC_SHIFT_P;
         break;

      case AMD64CondNL:
      case AMD64CondL:   /* (SF xor OF) == 1 */
         test = (rflags >> AMD64G_CC_SHIFT_S)
                ^ (rflags >> AMD64G_CC_SHIFT_O);
         break;

      case AMD64CondNLE:
      case AMD64CondLE:  /* ((SF xor OF) or ZF) == 1 */
         test = ((rflags >> AMD64G_CC_SHIFT_S)
                 ^ (rflags >> AMD64G_CC_SHIFT_O))
                | (rflags >> AMD64G_CC_SHIFT_Z);
         break;

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("amd64g_calculate_condition"
                    "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("amd64g_calculate_condition");
   }

   /* Only bit 0 of 'test' is meaningful. */
   return 1 & (inv ^ test);
}
810
811
812 /* VISIBLE TO LIBVEX CLIENT */
LibVEX_GuestAMD64_get_rflags(VexGuestAMD64State * vex_state)813 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/VexGuestAMD64State* vex_state )
814 {
815 ULong rflags = amd64g_calculate_rflags_all_WRK(
816 vex_state->guest_CC_OP,
817 vex_state->guest_CC_DEP1,
818 vex_state->guest_CC_DEP2,
819 vex_state->guest_CC_NDEP
820 );
821 Long dflag = vex_state->guest_DFLAG;
822 vassert(dflag == 1 || dflag == -1);
823 if (dflag == -1)
824 rflags |= (1<<10);
825 if (vex_state->guest_IDFLAG == 1)
826 rflags |= (1<<21);
827 if (vex_state->guest_ACFLAG == 1)
828 rflags |= (1<<18);
829
830 return rflags;
831 }
832
833 /* VISIBLE TO LIBVEX CLIENT */
834 void
LibVEX_GuestAMD64_put_rflag_c(ULong new_carry_flag,VexGuestAMD64State * vex_state)835 LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
836 /*MOD*/VexGuestAMD64State* vex_state )
837 {
838 ULong oszacp = amd64g_calculate_rflags_all_WRK(
839 vex_state->guest_CC_OP,
840 vex_state->guest_CC_DEP1,
841 vex_state->guest_CC_DEP2,
842 vex_state->guest_CC_NDEP
843 );
844 if (new_carry_flag & 1) {
845 oszacp |= AMD64G_CC_MASK_C;
846 } else {
847 oszacp &= ~AMD64G_CC_MASK_C;
848 }
849 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
850 vex_state->guest_CC_DEP1 = oszacp;
851 vex_state->guest_CC_DEP2 = 0;
852 vex_state->guest_CC_NDEP = 0;
853 }
854
855
856 /*---------------------------------------------------------------*/
857 /*--- %rflags translation-time function specialisers. ---*/
/*--- These help iropt specialise calls to the above         ---*/
/*--- run-time %rflags functions.                            ---*/
860 /*---------------------------------------------------------------*/
861
862 /* Used by the optimiser to try specialisations. Returns an
863 equivalent expression, or NULL if none. */
864
isU64(IRExpr * e,ULong n)865 static Bool isU64 ( IRExpr* e, ULong n )
866 {
867 return toBool( e->tag == Iex_Const
868 && e->Iex.Const.con->tag == Ico_U64
869 && e->Iex.Const.con->Ico.U64 == n );
870 }
871
guest_amd64_spechelper(HChar * function_name,IRExpr ** args,IRStmt ** precedingStmts,Int n_precedingStmts)872 IRExpr* guest_amd64_spechelper ( HChar* function_name,
873 IRExpr** args,
874 IRStmt** precedingStmts,
875 Int n_precedingStmts )
876 {
877 # define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
878 # define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
879 # define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
880 # define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
881 # define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
882
883 Int i, arity = 0;
884 for (i = 0; args[i]; i++)
885 arity++;
886 # if 0
887 vex_printf("spec request:\n");
888 vex_printf(" %s ", function_name);
889 for (i = 0; i < arity; i++) {
890 vex_printf(" ");
891 ppIRExpr(args[i]);
892 }
893 vex_printf("\n");
894 # endif
895
896 /* --------- specialising "amd64g_calculate_condition" --------- */
897
898 if (vex_streq(function_name, "amd64g_calculate_condition")) {
899 /* specialise calls to above "calculate condition" function */
900 IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
901 vassert(arity == 5);
902 cond = args[0];
903 cc_op = args[1];
904 cc_dep1 = args[2];
905 cc_dep2 = args[3];
906
907 /*---------------- ADDQ ----------------*/
908
909 if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
910 /* long long add, then Z --> test (dst+src == 0) */
911 return unop(Iop_1Uto64,
912 binop(Iop_CmpEQ64,
913 binop(Iop_Add64, cc_dep1, cc_dep2),
914 mkU64(0)));
915 }
916
917 /*---------------- SUBQ ----------------*/
918
919 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
920 /* long long sub/cmp, then Z --> test dst==src */
921 return unop(Iop_1Uto64,
922 binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
923 }
924 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
925 /* long long sub/cmp, then NZ --> test dst!=src */
926 return unop(Iop_1Uto64,
927 binop(Iop_CmpNE64,cc_dep1,cc_dep2));
928 }
929
930 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
931 /* long long sub/cmp, then L (signed less than)
932 --> test dst <s src */
933 return unop(Iop_1Uto64,
934 binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
935 }
936
937 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
938 /* long long sub/cmp, then B (unsigned less than)
939 --> test dst <u src */
940 return unop(Iop_1Uto64,
941 binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
942 }
943 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
944 /* long long sub/cmp, then NB (unsigned greater than or equal)
945 --> test src <=u dst */
946 /* Note, args are opposite way round from the usual */
947 return unop(Iop_1Uto64,
948 binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
949 }
950
951 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
952 /* long long sub/cmp, then BE (unsigned less than or equal)
953 --> test dst <=u src */
954 return unop(Iop_1Uto64,
955 binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
956 }
957 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
958 /* long long sub/cmp, then NBE (unsigned greater than)
959 --> test !(dst <=u src) */
960 return binop(Iop_Xor64,
961 unop(Iop_1Uto64,
962 binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
963 mkU64(1));
964 }
965
966 /*---------------- SUBL ----------------*/
967
968 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
969 /* long sub/cmp, then Z --> test dst==src */
970 return unop(Iop_1Uto64,
971 binop(Iop_CmpEQ32,
972 unop(Iop_64to32, cc_dep1),
973 unop(Iop_64to32, cc_dep2)));
974 }
975 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
976 /* long sub/cmp, then NZ --> test dst!=src */
977 return unop(Iop_1Uto64,
978 binop(Iop_CmpNE32,
979 unop(Iop_64to32, cc_dep1),
980 unop(Iop_64to32, cc_dep2)));
981 }
982
983 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
984 /* long sub/cmp, then L (signed less than)
985 --> test dst <s src */
986 return unop(Iop_1Uto64,
987 binop(Iop_CmpLT32S,
988 unop(Iop_64to32, cc_dep1),
989 unop(Iop_64to32, cc_dep2)));
990 }
991
992 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
993 /* long sub/cmp, then LE (signed less than or equal)
994 --> test dst <=s src */
995 return unop(Iop_1Uto64,
996 binop(Iop_CmpLE32S,
997 unop(Iop_64to32, cc_dep1),
998 unop(Iop_64to32, cc_dep2)));
999
1000 }
1001 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
1002 /* long sub/cmp, then NLE (signed greater than)
1003 --> test !(dst <=s src)
1004 --> test (dst >s src)
1005 --> test (src <s dst) */
1006 return unop(Iop_1Uto64,
1007 binop(Iop_CmpLT32S,
1008 unop(Iop_64to32, cc_dep2),
1009 unop(Iop_64to32, cc_dep1)));
1010
1011 }
1012
1013 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
1014 /* long sub/cmp, then BE (unsigned less than or equal)
1015 --> test dst <=u src */
1016 return unop(Iop_1Uto64,
1017 binop(Iop_CmpLE32U,
1018 unop(Iop_64to32, cc_dep1),
1019 unop(Iop_64to32, cc_dep2)));
1020 }
1021 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
1022 /* long sub/cmp, then NBE (unsigned greater than)
1023 --> test src <u dst */
1024 /* Note, args are opposite way round from the usual */
1025 return unop(Iop_1Uto64,
1026 binop(Iop_CmpLT32U,
1027 unop(Iop_64to32, cc_dep2),
1028 unop(Iop_64to32, cc_dep1)));
1029 }
1030
1031 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
1032 /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
1033 return unop(Iop_1Uto64,
1034 binop(Iop_CmpLT32S,
1035 binop(Iop_Sub32,
1036 unop(Iop_64to32, cc_dep1),
1037 unop(Iop_64to32, cc_dep2)),
1038 mkU32(0)));
1039 }
1040
1041 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
1042 /* long sub/cmp, then B (unsigned less than)
1043 --> test dst <u src */
1044 return unop(Iop_1Uto64,
1045 binop(Iop_CmpLT32U,
1046 unop(Iop_64to32, cc_dep1),
1047 unop(Iop_64to32, cc_dep2)));
1048 }
1049
1050 /*---------------- SUBW ----------------*/
1051
1052 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
1053 /* word sub/cmp, then Z --> test dst==src */
1054 return unop(Iop_1Uto64,
1055 binop(Iop_CmpEQ16,
1056 unop(Iop_64to16,cc_dep1),
1057 unop(Iop_64to16,cc_dep2)));
1058 }
1059 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
1060 /* word sub/cmp, then NZ --> test dst!=src */
1061 return unop(Iop_1Uto64,
1062 binop(Iop_CmpNE16,
1063 unop(Iop_64to16,cc_dep1),
1064 unop(Iop_64to16,cc_dep2)));
1065 }
1066
1067 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
1068 /* word sub/cmp, then LE (signed less than or equal)
1069 --> test dst <=s src */
1070 return unop(Iop_1Uto64,
1071 binop(Iop_CmpLE64S,
1072 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1073 binop(Iop_Shl64,cc_dep2,mkU8(48))));
1074
1075 }
1076
1077 /*---------------- SUBB ----------------*/
1078
1079 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
1080 /* byte sub/cmp, then Z --> test dst==src */
1081 return unop(Iop_1Uto64,
1082 binop(Iop_CmpEQ8,
1083 unop(Iop_64to8,cc_dep1),
1084 unop(Iop_64to8,cc_dep2)));
1085 }
1086 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
1087 /* byte sub/cmp, then NZ --> test dst!=src */
1088 return unop(Iop_1Uto64,
1089 binop(Iop_CmpNE8,
1090 unop(Iop_64to8,cc_dep1),
1091 unop(Iop_64to8,cc_dep2)));
1092 }
1093
1094 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
1095 /* byte sub/cmp, then BE (unsigned less than or equal)
1096 --> test dst <=u src */
1097 return unop(Iop_1Uto64,
1098 binop(Iop_CmpLE64U,
1099 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1100 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1101 }
1102
1103 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
1104 && isU64(cc_dep2, 0)) {
1105 /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
1106 --> test dst <s 0
1107 --> (ULong)dst[7]
1108 This is yet another scheme by which gcc figures out if the
1109 top bit of a byte is 1 or 0. See also LOGICB/CondS below. */
1110 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1111 for an 8-bit comparison, since the args to the helper
1112 function are always U64s. */
1113 return binop(Iop_And64,
1114 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1115 mkU64(1));
1116 }
1117 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
1118 && isU64(cc_dep2, 0)) {
1119 /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1120 --> test !(dst <s 0)
1121 --> (ULong) !dst[7]
1122 */
1123 return binop(Iop_Xor64,
1124 binop(Iop_And64,
1125 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1126 mkU64(1)),
1127 mkU64(1));
1128 }
1129
1130 /*---------------- LOGICQ ----------------*/
1131
1132 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
1133 /* long long and/or/xor, then Z --> test dst==0 */
1134 return unop(Iop_1Uto64,
1135 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1136 }
1137 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
1138 /* long long and/or/xor, then NZ --> test dst!=0 */
1139 return unop(Iop_1Uto64,
1140 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1141 }
1142
1143 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
1144 /* long long and/or/xor, then L
1145 LOGIC sets SF and ZF according to the
1146 result and makes OF be zero. L computes SF ^ OF, but
1147 OF is zero, so this reduces to SF -- which will be 1 iff
1148 the result is < signed 0. Hence ...
1149 */
1150 return unop(Iop_1Uto64,
1151 binop(Iop_CmpLT64S,
1152 cc_dep1,
1153 mkU64(0)));
1154 }
1155
1156 /*---------------- LOGICL ----------------*/
1157
1158 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
1159 /* long and/or/xor, then Z --> test dst==0 */
1160 return unop(Iop_1Uto64,
1161 binop(Iop_CmpEQ32,
1162 unop(Iop_64to32, cc_dep1),
1163 mkU32(0)));
1164 }
1165 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
1166 /* long and/or/xor, then NZ --> test dst!=0 */
1167 return unop(Iop_1Uto64,
1168 binop(Iop_CmpNE32,
1169 unop(Iop_64to32, cc_dep1),
1170 mkU32(0)));
1171 }
1172
1173 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
1174 /* long and/or/xor, then LE
1175 This is pretty subtle. LOGIC sets SF and ZF according to the
1176 result and makes OF be zero. LE computes (SF ^ OF) | ZF, but
1177 OF is zero, so this reduces to SF | ZF -- which will be 1 iff
1178 the result is <=signed 0. Hence ...
1179 */
1180 return unop(Iop_1Uto64,
1181 binop(Iop_CmpLE32S,
1182 unop(Iop_64to32, cc_dep1),
1183 mkU32(0)));
1184 }
1185
1186 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
1187 /* long and/or/xor, then S --> (ULong)result[31] */
1188 return binop(Iop_And64,
1189 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1190 mkU64(1));
1191 }
1192 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
1193 /* long and/or/xor, then S --> (ULong) ~ result[31] */
1194 return binop(Iop_Xor64,
1195 binop(Iop_And64,
1196 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1197 mkU64(1)),
1198 mkU64(1));
1199 }
1200
1201 /*---------------- LOGICW ----------------*/
1202
1203 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1204 /* word and/or/xor, then Z --> test dst==0 */
1205 return unop(Iop_1Uto64,
1206 binop(Iop_CmpEQ64,
1207 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1208 mkU64(0)));
1209 }
1210 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1211 /* word and/or/xor, then NZ --> test dst!=0 */
1212 return unop(Iop_1Uto64,
1213 binop(Iop_CmpNE64,
1214 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1215 mkU64(0)));
1216 }
1217
1218 /*---------------- LOGICB ----------------*/
1219
1220 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1221 /* byte and/or/xor, then Z --> test dst==0 */
1222 return unop(Iop_1Uto64,
1223 binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
1224 mkU64(0)));
1225 }
1226 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1227 /* byte and/or/xor, then NZ --> test dst!=0 */
1228 return unop(Iop_1Uto64,
1229 binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
1230 mkU64(0)));
1231 }
1232
1233 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1234 /* this is an idiom gcc sometimes uses to find out if the top
1235 bit of a byte register is set: eg testb %al,%al; js ..
1236 Since it just depends on the top bit of the byte, extract
1237 that bit and explicitly get rid of all the rest. This
1238 helps memcheck avoid false positives in the case where any
1239 of the other bits in the byte are undefined. */
1240 /* byte and/or/xor, then S --> (UInt)result[7] */
1241 return binop(Iop_And64,
1242 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1243 mkU64(1));
1244 }
1245 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1246 /* byte and/or/xor, then NS --> (UInt)!result[7] */
1247 return binop(Iop_Xor64,
1248 binop(Iop_And64,
1249 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1250 mkU64(1)),
1251 mkU64(1));
1252 }
1253
1254 /*---------------- INCB ----------------*/
1255
1256 if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1257 /* 8-bit inc, then LE --> sign bit of the arg */
1258 return binop(Iop_And64,
1259 binop(Iop_Shr64,
1260 binop(Iop_Sub64, cc_dep1, mkU64(1)),
1261 mkU8(7)),
1262 mkU64(1));
1263 }
1264
1265 /*---------------- INCW ----------------*/
1266
1267 if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1268 /* 16-bit inc, then Z --> test dst == 0 */
1269 return unop(Iop_1Uto64,
1270 binop(Iop_CmpEQ64,
1271 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1272 mkU64(0)));
1273 }
1274
1275 /*---------------- DECL ----------------*/
1276
1277 if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1278 /* dec L, then Z --> test dst == 0 */
1279 return unop(Iop_1Uto64,
1280 binop(Iop_CmpEQ32,
1281 unop(Iop_64to32, cc_dep1),
1282 mkU32(0)));
1283 }
1284
1285 /*---------------- DECW ----------------*/
1286
1287 if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1288 /* 16-bit dec, then NZ --> test dst != 0 */
1289 return unop(Iop_1Uto64,
1290 binop(Iop_CmpNE64,
1291 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1292 mkU64(0)));
1293 }
1294
1295 /*---------------- COPY ----------------*/
1296 /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1297 jbe" for example. */
1298
1299 if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
1300 (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1301 /* COPY, then BE --> extract C and Z from dep1, and test (C
1302 or Z == 1). */
1303 /* COPY, then NBE --> extract C and Z from dep1, and test (C
1304 or Z == 0). */
1305 ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1306 return
1307 unop(
1308 Iop_1Uto64,
1309 binop(
1310 Iop_CmpEQ64,
1311 binop(
1312 Iop_And64,
1313 binop(
1314 Iop_Or64,
1315 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1316 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
1317 ),
1318 mkU64(1)
1319 ),
1320 mkU64(nnn)
1321 )
1322 );
1323 }
1324
1325 if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
1326 /* COPY, then B --> extract C dep1, and test (C == 1). */
1327 return
1328 unop(
1329 Iop_1Uto64,
1330 binop(
1331 Iop_CmpNE64,
1332 binop(
1333 Iop_And64,
1334 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1335 mkU64(1)
1336 ),
1337 mkU64(0)
1338 )
1339 );
1340 }
1341
1342 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1343 && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1344 /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1345 /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1346 UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1347 return
1348 unop(
1349 Iop_1Uto64,
1350 binop(
1351 Iop_CmpEQ64,
1352 binop(
1353 Iop_And64,
1354 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1355 mkU64(1)
1356 ),
1357 mkU64(nnn)
1358 )
1359 );
1360 }
1361
1362 if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
1363 /* COPY, then P --> extract P from dep1, and test (P == 1). */
1364 return
1365 unop(
1366 Iop_1Uto64,
1367 binop(
1368 Iop_CmpNE64,
1369 binop(
1370 Iop_And64,
1371 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1372 mkU64(1)
1373 ),
1374 mkU64(0)
1375 )
1376 );
1377 }
1378
1379 return NULL;
1380 }
1381
1382 /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1383
1384 if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1385 /* specialise calls to above "calculate_rflags_c" function */
1386 IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1387 vassert(arity == 4);
1388 cc_op = args[0];
1389 cc_dep1 = args[1];
1390 cc_dep2 = args[2];
1391 cc_ndep = args[3];
1392
1393 if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1394 /* C after sub denotes unsigned less than */
1395 return unop(Iop_1Uto64,
1396 binop(Iop_CmpLT64U,
1397 cc_dep1,
1398 cc_dep2));
1399 }
1400 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1401 /* C after sub denotes unsigned less than */
1402 return unop(Iop_1Uto64,
1403 binop(Iop_CmpLT32U,
1404 unop(Iop_64to32, cc_dep1),
1405 unop(Iop_64to32, cc_dep2)));
1406 }
1407 if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1408 /* C after sub denotes unsigned less than */
1409 return unop(Iop_1Uto64,
1410 binop(Iop_CmpLT64U,
1411 binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1412 binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1413 }
1414 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1415 || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1416 || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1417 || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1418 /* cflag after logic is zero */
1419 return mkU64(0);
1420 }
1421 if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1422 || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1423 /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1424 return cc_ndep;
1425 }
1426
1427 # if 0
1428 if (cc_op->tag == Iex_Const) {
1429 vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1430 }
1431 # endif
1432
1433 return NULL;
1434 }
1435
1436 # undef unop
1437 # undef binop
1438 # undef mkU64
1439 # undef mkU32
1440 # undef mkU8
1441
1442 return NULL;
1443 }
1444
1445
1446 /*---------------------------------------------------------------*/
1447 /*--- Supporting functions for x87 FPU activities. ---*/
1448 /*---------------------------------------------------------------*/
1449
host_is_little_endian(void)1450 static inline Bool host_is_little_endian ( void )
1451 {
1452 UInt x = 0x76543210;
1453 UChar* p = (UChar*)(&x);
1454 return toBool(*p == 0x10);
1455 }
1456
1457 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
1458 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calculate_FXAM(ULong tag,ULong dbl)1459 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
1460 {
1461 Bool mantissaIsZero;
1462 Int bexp;
1463 UChar sign;
1464 UChar* f64;
1465
1466 vassert(host_is_little_endian());
1467
1468 /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
1469
1470 f64 = (UChar*)(&dbl);
1471 sign = toUChar( (f64[7] >> 7) & 1 );
1472
1473 /* First off, if the tag indicates the register was empty,
1474 return 1,0,sign,1 */
1475 if (tag == 0) {
1476 /* vex_printf("Empty\n"); */
1477 return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
1478 | AMD64G_FC_MASK_C0;
1479 }
1480
1481 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
1482 bexp &= 0x7FF;
1483
1484 mantissaIsZero
1485 = toBool(
1486 (f64[6] & 0x0F) == 0
1487 && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
1488 );
1489
1490 /* If both exponent and mantissa are zero, the value is zero.
1491 Return 1,0,sign,0. */
1492 if (bexp == 0 && mantissaIsZero) {
1493 /* vex_printf("Zero\n"); */
1494 return AMD64G_FC_MASK_C3 | 0
1495 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1496 }
1497
1498 /* If exponent is zero but mantissa isn't, it's a denormal.
1499 Return 1,1,sign,0. */
1500 if (bexp == 0 && !mantissaIsZero) {
1501 /* vex_printf("Denormal\n"); */
1502 return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
1503 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1504 }
1505
1506 /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
1507 Return 0,1,sign,1. */
1508 if (bexp == 0x7FF && mantissaIsZero) {
1509 /* vex_printf("Inf\n"); */
1510 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
1511 | AMD64G_FC_MASK_C0;
1512 }
1513
1514 /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
1515 Return 0,0,sign,1. */
1516 if (bexp == 0x7FF && !mantissaIsZero) {
1517 /* vex_printf("NaN\n"); */
1518 return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
1519 }
1520
1521 /* Uh, ok, we give up. It must be a normal finite number.
1522 Return 0,1,sign,0.
1523 */
1524 /* vex_printf("normal\n"); */
1525 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1526 }
1527
1528
1529 /* This is used to implement both 'frstor' and 'fldenv'. The latter
1530 appears to differ from the former only in that the 8 FP registers
1531 themselves are not transferred into the guest state. */
1532 static
do_put_x87(Bool moveRegs,UChar * x87_state,VexGuestAMD64State * vex_state)1533 VexEmWarn do_put_x87 ( Bool moveRegs,
1534 /*IN*/UChar* x87_state,
1535 /*OUT*/VexGuestAMD64State* vex_state )
1536 {
1537 Int stno, preg;
1538 UInt tag;
1539 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1540 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1541 Fpu_State* x87 = (Fpu_State*)x87_state;
1542 UInt ftop = (x87->env[FP_ENV_STAT] >> 11) & 7;
1543 UInt tagw = x87->env[FP_ENV_TAG];
1544 UInt fpucw = x87->env[FP_ENV_CTRL];
1545 UInt c3210 = x87->env[FP_ENV_STAT] & 0x4700;
1546 VexEmWarn ew;
1547 UInt fpround;
1548 ULong pair;
1549
1550 /* Copy registers and tags */
1551 for (stno = 0; stno < 8; stno++) {
1552 preg = (stno + ftop) & 7;
1553 tag = (tagw >> (2*preg)) & 3;
1554 if (tag == 3) {
1555 /* register is empty */
1556 /* hmm, if it's empty, does it still get written? Probably
1557 safer to say it does. If we don't, memcheck could get out
1558 of sync, in that it thinks all FP registers are defined by
1559 this helper, but in reality some have not been updated. */
1560 if (moveRegs)
1561 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
1562 vexTags[preg] = 0;
1563 } else {
1564 /* register is non-empty */
1565 if (moveRegs)
1566 convert_f80le_to_f64le( &x87->reg[10*stno],
1567 (UChar*)&vexRegs[preg] );
1568 vexTags[preg] = 1;
1569 }
1570 }
1571
1572 /* stack pointer */
1573 vex_state->guest_FTOP = ftop;
1574
1575 /* status word */
1576 vex_state->guest_FC3210 = c3210;
1577
1578 /* handle the control word, setting FPROUND and detecting any
1579 emulation warnings. */
1580 pair = amd64g_check_fldcw ( (ULong)fpucw );
1581 fpround = (UInt)pair & 0xFFFFFFFFULL;
1582 ew = (VexEmWarn)(pair >> 32);
1583
1584 vex_state->guest_FPROUND = fpround & 3;
1585
1586 /* emulation warnings --> caller */
1587 return ew;
1588 }
1589
1590
1591 /* Create an x87 FPU state from the guest state, as close as
1592 we can approximate it. */
1593 static
do_get_x87(VexGuestAMD64State * vex_state,UChar * x87_state)1594 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
1595 /*OUT*/UChar* x87_state )
1596 {
1597 Int i, stno, preg;
1598 UInt tagw;
1599 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1600 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1601 Fpu_State* x87 = (Fpu_State*)x87_state;
1602 UInt ftop = vex_state->guest_FTOP;
1603 UInt c3210 = vex_state->guest_FC3210;
1604
1605 for (i = 0; i < 14; i++)
1606 x87->env[i] = 0;
1607
1608 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1609 x87->env[FP_ENV_STAT]
1610 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
1611 x87->env[FP_ENV_CTRL]
1612 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
1613
1614 /* Dump the register stack in ST order. */
1615 tagw = 0;
1616 for (stno = 0; stno < 8; stno++) {
1617 preg = (stno + ftop) & 7;
1618 if (vexTags[preg] == 0) {
1619 /* register is empty */
1620 tagw |= (3 << (2*preg));
1621 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1622 &x87->reg[10*stno] );
1623 } else {
1624 /* register is full. */
1625 tagw |= (0 << (2*preg));
1626 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1627 &x87->reg[10*stno] );
1628 }
1629 }
1630 x87->env[FP_ENV_TAG] = toUShort(tagw);
1631 }
1632
1633
1634 /* CALLED FROM GENERATED CODE */
1635 /* DIRTY HELPER (reads guest state, writes guest mem) */
1636 /* NOTE: only handles 32-bit format (no REX.W on the insn) */
amd64g_dirtyhelper_FXSAVE(VexGuestAMD64State * gst,HWord addr)1637 void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
1638 {
1639 /* Derived from values obtained from
1640 vendor_id : AuthenticAMD
1641 cpu family : 15
1642 model : 12
1643 model name : AMD Athlon(tm) 64 Processor 3200+
1644 stepping : 0
1645 cpu MHz : 2200.000
1646 cache size : 512 KB
1647 */
1648 /* Somewhat roundabout, but at least it's simple. */
1649 Fpu_State tmp;
1650 UShort* addrS = (UShort*)addr;
1651 UChar* addrC = (UChar*)addr;
1652 U128* xmm = (U128*)(addr + 160);
1653 UInt mxcsr;
1654 UShort fp_tags;
1655 UInt summary_tags;
1656 Int r, stno;
1657 UShort *srcS, *dstS;
1658
1659 do_get_x87( gst, (UChar*)&tmp );
1660 mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
1661
1662 /* Now build the proper fxsave image from the x87 image we just
1663 made. */
1664
1665 addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
1666 addrS[1] = tmp.env[FP_ENV_STAT]; /* FCW: fpu status word */
1667
1668 /* set addrS[2] in an endian-independent way */
1669 summary_tags = 0;
1670 fp_tags = tmp.env[FP_ENV_TAG];
1671 for (r = 0; r < 8; r++) {
1672 if ( ((fp_tags >> (2*r)) & 3) != 3 )
1673 summary_tags |= (1 << r);
1674 }
1675 addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */
1676 addrC[5] = 0; /* pad */
1677
1678 /* FOP: faulting fpu opcode. From experimentation, the real CPU
1679 does not write this field. (?!) */
1680 addrS[3] = 0; /* BOGUS */
1681
1682 /* RIP (Last x87 instruction pointer). From experimentation, the
1683 real CPU does not write this field. (?!) */
1684 addrS[4] = 0; /* BOGUS */
1685 addrS[5] = 0; /* BOGUS */
1686 addrS[6] = 0; /* BOGUS */
1687 addrS[7] = 0; /* BOGUS */
1688
1689 /* RDP (Last x87 data pointer). From experimentation, the real CPU
1690 does not write this field. (?!) */
1691 addrS[8] = 0; /* BOGUS */
1692 addrS[9] = 0; /* BOGUS */
1693 addrS[10] = 0; /* BOGUS */
1694 addrS[11] = 0; /* BOGUS */
1695
1696 addrS[12] = toUShort(mxcsr); /* MXCSR */
1697 addrS[13] = toUShort(mxcsr >> 16);
1698
1699 addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
1700 addrS[15] = 0x0000; /* MXCSR mask (hi16) */
1701
1702 /* Copy in the FP registers, in ST order. */
1703 for (stno = 0; stno < 8; stno++) {
1704 srcS = (UShort*)(&tmp.reg[10*stno]);
1705 dstS = (UShort*)(&addrS[16 + 8*stno]);
1706 dstS[0] = srcS[0];
1707 dstS[1] = srcS[1];
1708 dstS[2] = srcS[2];
1709 dstS[3] = srcS[3];
1710 dstS[4] = srcS[4];
1711 dstS[5] = 0;
1712 dstS[6] = 0;
1713 dstS[7] = 0;
1714 }
1715
1716 /* That's the first 160 bytes of the image done. Now only %xmm0
1717 .. %xmm15 remain to be copied. If the host is big-endian, these
1718 need to be byte-swapped. */
1719 vassert(host_is_little_endian());
1720
1721 # define COPY_U128(_dst,_src) \
1722 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
1723 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
1724 while (0)
1725
1726 COPY_U128( xmm[0], gst->guest_YMM0 );
1727 COPY_U128( xmm[1], gst->guest_YMM1 );
1728 COPY_U128( xmm[2], gst->guest_YMM2 );
1729 COPY_U128( xmm[3], gst->guest_YMM3 );
1730 COPY_U128( xmm[4], gst->guest_YMM4 );
1731 COPY_U128( xmm[5], gst->guest_YMM5 );
1732 COPY_U128( xmm[6], gst->guest_YMM6 );
1733 COPY_U128( xmm[7], gst->guest_YMM7 );
1734 COPY_U128( xmm[8], gst->guest_YMM8 );
1735 COPY_U128( xmm[9], gst->guest_YMM9 );
1736 COPY_U128( xmm[10], gst->guest_YMM10 );
1737 COPY_U128( xmm[11], gst->guest_YMM11 );
1738 COPY_U128( xmm[12], gst->guest_YMM12 );
1739 COPY_U128( xmm[13], gst->guest_YMM13 );
1740 COPY_U128( xmm[14], gst->guest_YMM14 );
1741 COPY_U128( xmm[15], gst->guest_YMM15 );
1742
1743 # undef COPY_U128
1744 }
1745
1746
1747 /* CALLED FROM GENERATED CODE */
1748 /* DIRTY HELPER (writes guest state, reads guest mem) */
amd64g_dirtyhelper_FXRSTOR(VexGuestAMD64State * gst,HWord addr)1749 VexEmWarn amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
1750 {
1751 Fpu_State tmp;
1752 VexEmWarn warnX87 = EmWarn_NONE;
1753 VexEmWarn warnXMM = EmWarn_NONE;
1754 UShort* addrS = (UShort*)addr;
1755 UChar* addrC = (UChar*)addr;
1756 U128* xmm = (U128*)(addr + 160);
1757 UShort fp_tags;
1758 Int r, stno, i;
1759
1760 /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need
1761 to be byte-swapped. */
1762 vassert(host_is_little_endian());
1763
1764 # define COPY_U128(_dst,_src) \
1765 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
1766 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
1767 while (0)
1768
1769 COPY_U128( gst->guest_YMM0, xmm[0] );
1770 COPY_U128( gst->guest_YMM1, xmm[1] );
1771 COPY_U128( gst->guest_YMM2, xmm[2] );
1772 COPY_U128( gst->guest_YMM3, xmm[3] );
1773 COPY_U128( gst->guest_YMM4, xmm[4] );
1774 COPY_U128( gst->guest_YMM5, xmm[5] );
1775 COPY_U128( gst->guest_YMM6, xmm[6] );
1776 COPY_U128( gst->guest_YMM7, xmm[7] );
1777 COPY_U128( gst->guest_YMM8, xmm[8] );
1778 COPY_U128( gst->guest_YMM9, xmm[9] );
1779 COPY_U128( gst->guest_YMM10, xmm[10] );
1780 COPY_U128( gst->guest_YMM11, xmm[11] );
1781 COPY_U128( gst->guest_YMM12, xmm[12] );
1782 COPY_U128( gst->guest_YMM13, xmm[13] );
1783 COPY_U128( gst->guest_YMM14, xmm[14] );
1784 COPY_U128( gst->guest_YMM15, xmm[15] );
1785
1786 # undef COPY_U128
1787
1788 /* Copy the x87 registers out of the image, into a temporary
1789 Fpu_State struct. */
1790 for (i = 0; i < 14; i++) tmp.env[i] = 0;
1791 for (i = 0; i < 80; i++) tmp.reg[i] = 0;
1792 /* fill in tmp.reg[0..7] */
1793 for (stno = 0; stno < 8; stno++) {
1794 UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
1795 UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
1796 dstS[0] = srcS[0];
1797 dstS[1] = srcS[1];
1798 dstS[2] = srcS[2];
1799 dstS[3] = srcS[3];
1800 dstS[4] = srcS[4];
1801 }
1802 /* fill in tmp.env[0..13] */
1803 tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
1804 tmp.env[FP_ENV_STAT] = addrS[1]; /* FCW: fpu status word */
1805
1806 fp_tags = 0;
1807 for (r = 0; r < 8; r++) {
1808 if (addrC[4] & (1<<r))
1809 fp_tags |= (0 << (2*r)); /* EMPTY */
1810 else
1811 fp_tags |= (3 << (2*r)); /* VALID -- not really precise enough. */
1812 }
1813 tmp.env[FP_ENV_TAG] = fp_tags;
1814
1815 /* Now write 'tmp' into the guest state. */
1816 warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
1817
1818 { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
1819 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
1820 ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
1821
1822 warnXMM = (VexEmWarn)(w64 >> 32);
1823
1824 gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
1825 }
1826
1827 /* Prefer an X87 emwarn over an XMM one, if both exist. */
1828 if (warnX87 != EmWarn_NONE)
1829 return warnX87;
1830 else
1831 return warnXMM;
1832 }
1833
1834
1835 /* DIRTY HELPER (writes guest state) */
1836 /* Initialise the x87 FPU state as per 'finit'. */
amd64g_dirtyhelper_FINIT(VexGuestAMD64State * gst)1837 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
1838 {
1839 Int i;
1840 gst->guest_FTOP = 0;
1841 for (i = 0; i < 8; i++) {
1842 gst->guest_FPTAG[i] = 0; /* empty */
1843 gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
1844 }
1845 gst->guest_FPROUND = (ULong)Irrm_NEAREST;
1846 gst->guest_FC3210 = 0;
1847 }
1848
1849
1850 /* CALLED FROM GENERATED CODE */
1851 /* DIRTY HELPER (reads guest memory) */
amd64g_dirtyhelper_loadF80le(ULong addrU)1852 ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
1853 {
1854 ULong f64;
1855 convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
1856 return f64;
1857 }
1858
1859 /* CALLED FROM GENERATED CODE */
1860 /* DIRTY HELPER (writes guest memory) */
amd64g_dirtyhelper_storeF80le(ULong addrU,ULong f64)1861 void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
1862 {
1863 convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
1864 }
1865
1866
1867 /* CALLED FROM GENERATED CODE */
1868 /* CLEAN HELPER */
1869 /* mxcsr[15:0] contains a SSE native format MXCSR value.
1870 Extract from it the required SSEROUND value and any resulting
1871 emulation warning, and return (warn << 32) | sseround value.
1872 */
amd64g_check_ldmxcsr(ULong mxcsr)1873 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
1874 {
1875 /* Decide on a rounding mode. mxcsr[14:13] holds it. */
1876 /* NOTE, encoded exactly as per enum IRRoundingMode. */
1877 ULong rmode = (mxcsr >> 13) & 3;
1878
1879 /* Detect any required emulation warnings. */
1880 VexEmWarn ew = EmWarn_NONE;
1881
1882 if ((mxcsr & 0x1F80) != 0x1F80) {
1883 /* unmasked exceptions! */
1884 ew = EmWarn_X86_sseExns;
1885 }
1886 else
1887 if (mxcsr & (1<<15)) {
1888 /* FZ is set */
1889 ew = EmWarn_X86_fz;
1890 }
1891 else
1892 if (mxcsr & (1<<6)) {
1893 /* DAZ is set */
1894 ew = EmWarn_X86_daz;
1895 }
1896
1897 return (((ULong)ew) << 32) | ((ULong)rmode);
1898 }
1899
1900
1901 /* CALLED FROM GENERATED CODE */
1902 /* CLEAN HELPER */
1903 /* Given sseround as an IRRoundingMode value, create a suitable SSE
1904 native format MXCSR value. */
amd64g_create_mxcsr(ULong sseround)1905 ULong amd64g_create_mxcsr ( ULong sseround )
1906 {
1907 sseround &= 3;
1908 return 0x1F80 | (sseround << 13);
1909 }
1910
1911
1912 /* CLEAN HELPER */
1913 /* fpucw[15:0] contains a x87 native format FPU control word.
1914 Extract from it the required FPROUND value and any resulting
1915 emulation warning, and return (warn << 32) | fpround value.
1916 */
amd64g_check_fldcw(ULong fpucw)1917 ULong amd64g_check_fldcw ( ULong fpucw )
1918 {
1919 /* Decide on a rounding mode. fpucw[11:10] holds it. */
1920 /* NOTE, encoded exactly as per enum IRRoundingMode. */
1921 ULong rmode = (fpucw >> 10) & 3;
1922
1923 /* Detect any required emulation warnings. */
1924 VexEmWarn ew = EmWarn_NONE;
1925
1926 if ((fpucw & 0x3F) != 0x3F) {
1927 /* unmasked exceptions! */
1928 ew = EmWarn_X86_x87exns;
1929 }
1930 else
1931 if (((fpucw >> 8) & 3) != 3) {
1932 /* unsupported precision */
1933 ew = EmWarn_X86_x87precision;
1934 }
1935
1936 return (((ULong)ew) << 32) | ((ULong)rmode);
1937 }
1938
1939
1940 /* CLEAN HELPER */
1941 /* Given fpround as an IRRoundingMode value, create a suitable x87
1942 native format FPU control word. */
amd64g_create_fpucw(ULong fpround)1943 ULong amd64g_create_fpucw ( ULong fpround )
1944 {
1945 fpround &= 3;
1946 return 0x037F | (fpround << 10);
1947 }
1948
1949
1950 /* This is used to implement 'fldenv'.
1951 Reads 28 bytes at x87_state[0 .. 27]. */
1952 /* CALLED FROM GENERATED CODE */
1953 /* DIRTY HELPER */
amd64g_dirtyhelper_FLDENV(VexGuestAMD64State * vex_state,HWord x87_state)1954 VexEmWarn amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
1955 /*IN*/HWord x87_state)
1956 {
1957 return do_put_x87( False, (UChar*)x87_state, vex_state );
1958 }
1959
1960
1961 /* CALLED FROM GENERATED CODE */
1962 /* DIRTY HELPER */
1963 /* Create an x87 FPU env from the guest state, as close as we can
1964 approximate it. Writes 28 bytes at x87_state[0..27]. */
amd64g_dirtyhelper_FSTENV(VexGuestAMD64State * vex_state,HWord x87_state)1965 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
1966 /*OUT*/HWord x87_state )
1967 {
1968 Int i, stno, preg;
1969 UInt tagw;
1970 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1971 Fpu_State* x87 = (Fpu_State*)x87_state;
1972 UInt ftop = vex_state->guest_FTOP;
1973 ULong c3210 = vex_state->guest_FC3210;
1974
1975 for (i = 0; i < 14; i++)
1976 x87->env[i] = 0;
1977
1978 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1979 x87->env[FP_ENV_STAT]
1980 = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
1981 x87->env[FP_ENV_CTRL]
1982 = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
1983
1984 /* Compute the x87 tag word. */
1985 tagw = 0;
1986 for (stno = 0; stno < 8; stno++) {
1987 preg = (stno + ftop) & 7;
1988 if (vexTags[preg] == 0) {
1989 /* register is empty */
1990 tagw |= (3 << (2*preg));
1991 } else {
1992 /* register is full. */
1993 tagw |= (0 << (2*preg));
1994 }
1995 }
1996 x87->env[FP_ENV_TAG] = toUShort(tagw);
1997
1998 /* We don't dump the x87 registers, tho. */
1999 }
2000
2001
2002 /* This is used to implement 'fnsave'.
2003 Writes 108 bytes at x87_state[0 .. 107]. */
2004 /* CALLED FROM GENERATED CODE */
2005 /* DIRTY HELPER */
amd64g_dirtyhelper_FNSAVE(VexGuestAMD64State * vex_state,HWord x87_state)2006 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2007 /*OUT*/HWord x87_state)
2008 {
2009 do_get_x87( vex_state, (UChar*)x87_state );
2010 }
2011
2012
2013 /* This is used to implement 'fnsaves'.
2014 Writes 94 bytes at x87_state[0 .. 93]. */
2015 /* CALLED FROM GENERATED CODE */
2016 /* DIRTY HELPER */
amd64g_dirtyhelper_FNSAVES(VexGuestAMD64State * vex_state,HWord x87_state)2017 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2018 /*OUT*/HWord x87_state)
2019 {
2020 Int i, stno, preg;
2021 UInt tagw;
2022 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2023 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2024 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2025 UInt ftop = vex_state->guest_FTOP;
2026 UInt c3210 = vex_state->guest_FC3210;
2027
2028 for (i = 0; i < 7; i++)
2029 x87->env[i] = 0;
2030
2031 x87->env[FPS_ENV_STAT]
2032 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2033 x87->env[FPS_ENV_CTRL]
2034 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2035
2036 /* Dump the register stack in ST order. */
2037 tagw = 0;
2038 for (stno = 0; stno < 8; stno++) {
2039 preg = (stno + ftop) & 7;
2040 if (vexTags[preg] == 0) {
2041 /* register is empty */
2042 tagw |= (3 << (2*preg));
2043 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2044 &x87->reg[10*stno] );
2045 } else {
2046 /* register is full. */
2047 tagw |= (0 << (2*preg));
2048 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2049 &x87->reg[10*stno] );
2050 }
2051 }
2052 x87->env[FPS_ENV_TAG] = toUShort(tagw);
2053 }
2054
2055
2056 /* This is used to implement 'frstor'.
2057 Reads 108 bytes at x87_state[0 .. 107]. */
2058 /* CALLED FROM GENERATED CODE */
2059 /* DIRTY HELPER */
amd64g_dirtyhelper_FRSTOR(VexGuestAMD64State * vex_state,HWord x87_state)2060 VexEmWarn amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2061 /*IN*/HWord x87_state)
2062 {
2063 return do_put_x87( True, (UChar*)x87_state, vex_state );
2064 }
2065
2066
2067 /* This is used to implement 'frstors'.
2068 Reads 94 bytes at x87_state[0 .. 93]. */
2069 /* CALLED FROM GENERATED CODE */
2070 /* DIRTY HELPER */
amd64g_dirtyhelper_FRSTORS(VexGuestAMD64State * vex_state,HWord x87_state)2071 VexEmWarn amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2072 /*IN*/HWord x87_state)
2073 {
2074 Int stno, preg;
2075 UInt tag;
2076 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2077 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2078 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2079 UInt ftop = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2080 UInt tagw = x87->env[FPS_ENV_TAG];
2081 UInt fpucw = x87->env[FPS_ENV_CTRL];
2082 UInt c3210 = x87->env[FPS_ENV_STAT] & 0x4700;
2083 VexEmWarn ew;
2084 UInt fpround;
2085 ULong pair;
2086
2087 /* Copy registers and tags */
2088 for (stno = 0; stno < 8; stno++) {
2089 preg = (stno + ftop) & 7;
2090 tag = (tagw >> (2*preg)) & 3;
2091 if (tag == 3) {
2092 /* register is empty */
2093 /* hmm, if it's empty, does it still get written? Probably
2094 safer to say it does. If we don't, memcheck could get out
2095 of sync, in that it thinks all FP registers are defined by
2096 this helper, but in reality some have not been updated. */
2097 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2098 vexTags[preg] = 0;
2099 } else {
2100 /* register is non-empty */
2101 convert_f80le_to_f64le( &x87->reg[10*stno],
2102 (UChar*)&vexRegs[preg] );
2103 vexTags[preg] = 1;
2104 }
2105 }
2106
2107 /* stack pointer */
2108 vex_state->guest_FTOP = ftop;
2109
2110 /* status word */
2111 vex_state->guest_FC3210 = c3210;
2112
2113 /* handle the control word, setting FPROUND and detecting any
2114 emulation warnings. */
2115 pair = amd64g_check_fldcw ( (ULong)fpucw );
2116 fpround = (UInt)pair & 0xFFFFFFFFULL;
2117 ew = (VexEmWarn)(pair >> 32);
2118
2119 vex_state->guest_FPROUND = fpround & 3;
2120
2121 /* emulation warnings --> caller */
2122 return ew;
2123 }
2124
2125
2126 /*---------------------------------------------------------------*/
2127 /*--- Misc integer helpers, including rotates and CPUID. ---*/
2128 /*---------------------------------------------------------------*/
2129
2130 /* Claim to be the following CPU, which is probably representative of
2131 the lowliest (earliest) amd64 offerings. It can do neither sse3
2132 nor cx16.
2133
2134 vendor_id : AuthenticAMD
2135 cpu family : 15
2136 model : 5
2137 model name : AMD Opteron (tm) Processor 848
2138 stepping : 10
2139 cpu MHz : 1797.682
2140 cache size : 1024 KB
2141 fpu : yes
2142 fpu_exception : yes
2143 cpuid level : 1
2144 wp : yes
2145 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2146 mtrr pge mca cmov pat pse36 clflush mmx fxsr
2147 sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2148 bogomips : 3600.62
2149 TLB size : 1088 4K pages
2150 clflush size : 64
2151 cache_alignment : 64
2152 address sizes : 40 bits physical, 48 bits virtual
2153 power management: ts fid vid ttp
2154
2155 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2156 we don't support them. See #291568. 3dnow is 80000001.EDX.31
2157 and 3dnowext is 80000001.EDX.30.
2158 */
/* Dirty helper: emulate CPUID for the baseline CPU.  The leaf is the
   low 32 bits of guest RAX.  The four result words are written,
   zero-extended, to guest RAX, RBX, RCX and RDX.  A leaf with no
   table entry yields all zeroes. */
void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
{
   /* One row per recognised leaf: { leaf, EAX, EBX, ECX, EDX }. */
   static const UInt leaf_table[][5] = {
      { 0x00000000, 0x00000001, 0x68747541, 0x444d4163, 0x69746e65 },
      { 0x00000001, 0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff },
      { 0x80000000, 0x80000018, 0x68747541, 0x444d4163, 0x69746e65 },
      /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
         the original it-is-supported EDX value that the h/w
         provides.  See #291568. */
      { 0x80000001, 0x00000f5a, 0x00000505, 0x00000000, 0x21d3fbff },
      { 0x80000002, 0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428 },
      { 0x80000003, 0x6f725020, 0x73736563, 0x3820726f, 0x00003834 },
      { 0x80000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000005, 0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140 },
      { 0x80000006, 0x00000000, 0x42004200, 0x04008140, 0x00000000 },
      { 0x80000007, 0x00000000, 0x00000000, 0x00000000, 0x0000000f },
      { 0x80000008, 0x00003028, 0x00000000, 0x00000000, 0x00000000 }
   };
   const UInt n_leaves = sizeof(leaf_table) / sizeof(leaf_table[0]);
   const UInt leaf     = (UInt)st->guest_RAX;
   UInt eax = 0, ebx = 0, ecx = 0, edx = 0;
   UInt i;

   for (i = 0; i < n_leaves; i++) {
      if (leaf_table[i][0] == leaf) {
         eax = leaf_table[i][1];
         ebx = leaf_table[i][2];
         ecx = leaf_table[i][3];
         edx = leaf_table[i][4];
         break;
      }
   }

   st->guest_RAX = (ULong)eax;
   st->guest_RBX = (ULong)ebx;
   st->guest_RCX = (ULong)ecx;
   st->guest_RDX = (ULong)edx;
}
2212
2213
2214 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2215 capable.
2216
2217 vendor_id : GenuineIntel
2218 cpu family : 6
2219 model : 15
2220 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2221 stepping : 6
2222 cpu MHz : 2394.000
2223 cache size : 4096 KB
2224 physical id : 0
2225 siblings : 2
2226 core id : 0
2227 cpu cores : 2
2228 fpu : yes
2229 fpu_exception : yes
2230 cpuid level : 10
2231 wp : yes
2232 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2233 mtrr pge mca cmov pat pse36 clflush dts acpi
2234 mmx fxsr sse sse2 ss ht tm syscall nx lm
2235 constant_tsc pni monitor ds_cpl vmx est tm2
2236 cx16 xtpr lahf_lm
2237 bogomips : 4798.78
2238 clflush size : 64
2239 cache_alignment : 64
2240 address sizes : 36 bits physical, 48 bits virtual
2241 power management:
2242 */
/* Dirty helper: emulate CPUID for the sse3-and-cx16 capable CPU.
   The leaf is the low 32 bits of guest RAX; for leaf 4 only, the
   sub-leaf is the low 32 bits of guest RCX.  The four result words
   are written, zero-extended, to guest RAX, RBX, RCX and RDX.  A
   leaf with no table entry reports the same values as leaf 0xa. */
void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
{
   /* { leaf, EAX, EBX, ECX, EDX } for every leaf that ignores ECX. */
   static const UInt leaf_table[][5] = {
      { 0x00000000, 0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69 },
      { 0x00000001, 0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff },
      { 0x00000002, 0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049 },
      { 0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000005, 0x00000040, 0x00000040, 0x00000003, 0x00000020 },
      { 0x00000006, 0x00000001, 0x00000002, 0x00000001, 0x00000000 },
      { 0x00000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000008, 0x00000400, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000009, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x0000000a, 0x07280202, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000000, 0x80000008, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000001, 0x00000000, 0x00000000, 0x00000001, 0x20100800 },
      { 0x80000002, 0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865 },
      { 0x80000003, 0x43203229, 0x20205550, 0x20202020, 0x20202020 },
      { 0x80000004, 0x30303636, 0x20402020, 0x30342e32, 0x007a4847 },
      { 0x80000005, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000006, 0x00000000, 0x00000000, 0x10008040, 0x00000000 },
      { 0x80000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000008, 0x00003024, 0x00000000, 0x00000000, 0x00000000 }
   };
   /* Leaf 4 results { EAX, EBX, ECX, EDX }, indexed by sub-leaf.
      Sub-leaves beyond the table yield all zeroes. */
   static const UInt leaf4_table[3][4] = {
      { 0x04000121, 0x01c0003f, 0x0000003f, 0x00000001 },
      { 0x04000122, 0x01c0003f, 0x0000003f, 0x00000001 },
      { 0x04004143, 0x03c0003f, 0x00000fff, 0x00000001 }
   };
   const UInt n_leaves = sizeof(leaf_table) / sizeof(leaf_table[0]);
   const UInt leaf     = (UInt)st->guest_RAX;
   /* Fallback for an unlisted leaf: identical to leaf 0xa. */
   UInt eax = 0x07280202, ebx = 0x00000000;
   UInt ecx = 0x00000000, edx = 0x00000000;
   UInt i;

   if (leaf == 0x00000004) {
      const UInt subleaf = (UInt)st->guest_RCX;
      if (subleaf < 3) {
         eax = leaf4_table[subleaf][0];
         ebx = leaf4_table[subleaf][1];
         ecx = leaf4_table[subleaf][2];
         edx = leaf4_table[subleaf][3];
      } else {
         eax = ebx = ecx = edx = 0;
      }
   } else {
      for (i = 0; i < n_leaves; i++) {
         if (leaf_table[i][0] == leaf) {
            eax = leaf_table[i][1];
            ebx = leaf_table[i][2];
            ecx = leaf_table[i][3];
            edx = leaf_table[i][4];
            break;
         }
      }
   }

   st->guest_RAX = (ULong)eax;
   st->guest_RBX = (ULong)ebx;
   st->guest_RCX = (ULong)ecx;
   st->guest_RDX = (ULong)edx;
}
2329
2330
2331 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2332 capable.
2333
2334 vendor_id : GenuineIntel
2335 cpu family : 6
2336 model : 37
2337 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
2338 stepping : 2
2339 cpu MHz : 3334.000
2340 cache size : 4096 KB
2341 physical id : 0
2342 siblings : 4
2343 core id : 0
2344 cpu cores : 2
2345 apicid : 0
2346 initial apicid : 0
2347 fpu : yes
2348 fpu_exception : yes
2349 cpuid level : 11
2350 wp : yes
2351 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2352 mtrr pge mca cmov pat pse36 clflush dts acpi
2353 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2354 lm constant_tsc arch_perfmon pebs bts rep_good
2355 xtopology nonstop_tsc aperfmperf pni pclmulqdq
2356 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2357 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2358 arat tpr_shadow vnmi flexpriority ept vpid
2359 bogomips : 6957.57
2360 clflush size : 64
2361 cache_alignment : 64
2362 address sizes : 36 bits physical, 48 bits virtual
2363 power management:
2364 */
/* Dirty helper: emulate CPUID for the sse4.2-and-cx16 capable CPU.
   The leaf is the low 32 bits of guest RAX and the sub-leaf the low
   32 bits of guest RCX (consulted for leaves 4, 0xb and 0xd only).
   The four result words are written, zero-extended, to guest RAX,
   RBX, RCX and RDX.  A leaf with no table entry reports
   { 1, 2, 0x100, 0 }. */
void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
{
   /* { leaf, EAX, EBX, ECX, EDX } for every leaf that ignores ECX. */
   static const UInt leaf_table[][5] = {
      { 0x00000000, 0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69 },
      { 0x00000001, 0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff },
      { 0x00000002, 0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c },
      { 0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000005, 0x00000040, 0x00000040, 0x00000003, 0x00001120 },
      { 0x00000006, 0x00000007, 0x00000002, 0x00000001, 0x00000000 },
      { 0x00000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000008, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000009, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x0000000a, 0x07300403, 0x00000004, 0x00000000, 0x00000603 },
      { 0x0000000c, 0x00000001, 0x00000002, 0x00000100, 0x00000000 },
      { 0x80000000, 0x80000008, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000001, 0x00000000, 0x00000000, 0x00000001, 0x28100800 },
      { 0x80000002, 0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865 },
      { 0x80000003, 0x35692029, 0x55504320, 0x20202020, 0x20202020 },
      { 0x80000004, 0x30373620, 0x20402020, 0x37342e33, 0x007a4847 },
      { 0x80000005, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000006, 0x00000000, 0x00000000, 0x01006040, 0x00000000 },
      { 0x80000007, 0x00000000, 0x00000000, 0x00000000, 0x00000100 },
      { 0x80000008, 0x00003024, 0x00000000, 0x00000000, 0x00000000 }
   };
   /* { leaf, sub-leaf, EAX, EBX, ECX, EDX } for leaves 4, 0xb, 0xd. */
   static const UInt subleaf_table[][6] = {
      { 0x00000004, 0, 0x1c004121, 0x01c0003f, 0x0000003f, 0x00000000 },
      { 0x00000004, 1, 0x1c004122, 0x00c0003f, 0x0000007f, 0x00000000 },
      { 0x00000004, 2, 0x1c004143, 0x01c0003f, 0x000001ff, 0x00000000 },
      { 0x00000004, 3, 0x1c03c163, 0x03c0003f, 0x00000fff, 0x00000002 },
      { 0x0000000b, 0, 0x00000001, 0x00000002, 0x00000100, 0x00000000 },
      { 0x0000000b, 1, 0x00000004, 0x00000004, 0x00000201, 0x00000000 },
      { 0x0000000d, 0, 0x00000001, 0x00000002, 0x00000100, 0x00000000 },
      { 0x0000000d, 1, 0x00000004, 0x00000004, 0x00000201, 0x00000000 }
   };
   const UInt n_leaves    = sizeof(leaf_table) / sizeof(leaf_table[0]);
   const UInt n_subleaves = sizeof(subleaf_table)
                            / sizeof(subleaf_table[0]);
   const UInt old_eax = (UInt)st->guest_RAX;
   const UInt old_ecx = (UInt)st->guest_RCX;
   /* Fallback for an unlisted leaf. */
   UInt eax = 0x00000001, ebx = 0x00000002;
   UInt ecx = 0x00000100, edx = 0x00000000;
   UInt i;

   if (old_eax == 0x00000004 || old_eax == 0x0000000b
       || old_eax == 0x0000000d) {
      /* Sub-leaf dependent.  An unlisted sub-leaf yields zeroes,
         except that leaves 0xb and 0xd hand the incoming ECX value
         back in ECX. */
      eax = ebx = edx = 0;
      ecx = (old_eax == 0x00000004) ? 0 : old_ecx;
      for (i = 0; i < n_subleaves; i++) {
         if (subleaf_table[i][0] == old_eax
             && subleaf_table[i][1] == old_ecx) {
            eax = subleaf_table[i][2];
            ebx = subleaf_table[i][3];
            ecx = subleaf_table[i][4];
            edx = subleaf_table[i][5];
            break;
         }
      }
   } else {
      for (i = 0; i < n_leaves; i++) {
         if (leaf_table[i][0] == old_eax) {
            eax = leaf_table[i][1];
            ebx = leaf_table[i][2];
            ecx = leaf_table[i][3];
            edx = leaf_table[i][4];
            break;
         }
      }
   }

   st->guest_RAX = (ULong)eax;
   st->guest_RBX = (ULong)ebx;
   st->guest_RCX = (ULong)ecx;
   st->guest_RDX = (ULong)edx;
}
2481
2482
2483 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
2484 capable.
2485
2486 vendor_id : GenuineIntel
2487 cpu family : 6
2488 model : 42
2489 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
2490 stepping : 7
2491 cpu MHz : 1600.000
2492 cache size : 6144 KB
2493 physical id : 0
2494 siblings : 4
2495 core id : 3
2496 cpu cores : 4
2497 apicid : 6
2498 initial apicid : 6
2499 fpu : yes
2500 fpu_exception : yes
2501 cpuid level : 13
2502 wp : yes
2503 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2504 mtrr pge mca cmov pat pse36 clflush dts acpi
2505 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2506 lm constant_tsc arch_perfmon pebs bts rep_good
2507 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
2508 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
2509 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
2510 lahf_lm ida arat epb xsaveopt pln pts dts
2511 tpr_shadow vnmi flexpriority ept vpid
2512
2513 bogomips : 5768.94
2514 clflush size : 64
2515 cache_alignment : 64
2516 address sizes : 36 bits physical, 48 bits virtual
2517 power management:
2518 */
/* Dirty helper: emulate CPUID for the AVX-and-cx16 capable CPU.
   The leaf is the low 32 bits of guest RAX and the sub-leaf the low
   32 bits of guest RCX (consulted for leaves 4, 0xb and 0xd only).
   The four result words are written, zero-extended, to guest RAX,
   RBX, RCX and RDX.  A leaf with no table entry reports
   { 7, 0x340, 0x340, 0 }. */
void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
{
   /* { leaf, EAX, EBX, ECX, EDX } for every leaf that ignores ECX. */
   static const UInt leaf_table[][5] = {
      { 0x00000000, 0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69 },
      { 0x00000001, 0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff },
      { 0x00000002, 0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000 },
      { 0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000005, 0x00000040, 0x00000040, 0x00000003, 0x00001120 },
      { 0x00000006, 0x00000077, 0x00000002, 0x00000009, 0x00000000 },
      { 0x00000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000008, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000009, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x0000000a, 0x07300803, 0x00000000, 0x00000000, 0x00000603 },
      { 0x0000000c, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x0000000e, 0x00000007, 0x00000340, 0x00000340, 0x00000000 },
      { 0x0000000f, 0x00000007, 0x00000340, 0x00000340, 0x00000000 },
      { 0x80000000, 0x80000008, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000001, 0x00000000, 0x00000000, 0x00000001, 0x28100800 },
      { 0x80000002, 0x20202020, 0x20202020, 0x65746e49, 0x2952286c },
      { 0x80000003, 0x726f4320, 0x4d542865, 0x35692029, 0x3033322d },
      { 0x80000004, 0x50432030, 0x20402055, 0x30382e32, 0x007a4847 },
      { 0x80000005, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000006, 0x00000000, 0x00000000, 0x01006040, 0x00000000 },
      { 0x80000007, 0x00000000, 0x00000000, 0x00000000, 0x00000100 },
      { 0x80000008, 0x00003024, 0x00000000, 0x00000000, 0x00000000 }
   };
   /* { leaf, sub-leaf, EAX, EBX, ECX, EDX } for leaves 4, 0xb, 0xd. */
   static const UInt subleaf_table[][6] = {
      { 0x00000004, 0, 0x1c004121, 0x01c0003f, 0x0000003f, 0x00000000 },
      { 0x00000004, 1, 0x1c004122, 0x01c0003f, 0x0000003f, 0x00000000 },
      { 0x00000004, 2, 0x1c004143, 0x01c0003f, 0x000001ff, 0x00000000 },
      { 0x00000004, 3, 0x1c03c163, 0x02c0003f, 0x00001fff, 0x00000006 },
      { 0x0000000b, 0, 0x00000001, 0x00000001, 0x00000100, 0x00000000 },
      { 0x0000000b, 1, 0x00000004, 0x00000004, 0x00000201, 0x00000000 },
      { 0x0000000d, 0, 0x00000007, 0x00000340, 0x00000340, 0x00000000 },
      { 0x0000000d, 1, 0x00000001, 0x00000000, 0x00000000, 0x00000000 },
      { 0x0000000d, 2, 0x00000100, 0x00000240, 0x00000000, 0x00000000 }
   };
   const UInt n_leaves    = sizeof(leaf_table) / sizeof(leaf_table[0]);
   const UInt n_subleaves = sizeof(subleaf_table)
                            / sizeof(subleaf_table[0]);
   const UInt old_eax = (UInt)st->guest_RAX;
   const UInt old_ecx = (UInt)st->guest_RCX;
   /* Fallback for an unlisted leaf. */
   UInt eax = 0x00000007, ebx = 0x00000340;
   UInt ecx = 0x00000340, edx = 0x00000000;
   UInt i;

   if (old_eax == 0x00000004 || old_eax == 0x0000000b
       || old_eax == 0x0000000d) {
      /* Sub-leaf dependent.  An unlisted sub-leaf yields zeroes,
         except that leaf 0xb hands the incoming ECX value back in
         ECX. */
      eax = ebx = edx = 0;
      ecx = (old_eax == 0x0000000b) ? old_ecx : 0;
      for (i = 0; i < n_subleaves; i++) {
         if (subleaf_table[i][0] == old_eax
             && subleaf_table[i][1] == old_ecx) {
            eax = subleaf_table[i][2];
            ebx = subleaf_table[i][3];
            ecx = subleaf_table[i][4];
            edx = subleaf_table[i][5];
            break;
         }
      }
   } else {
      for (i = 0; i < n_leaves; i++) {
         if (leaf_table[i][0] == old_eax) {
            eax = leaf_table[i][1];
            ebx = leaf_table[i][2];
            ecx = leaf_table[i][3];
            edx = leaf_table[i][4];
            break;
         }
      }
   }

   st->guest_RAX = (ULong)eax;
   st->guest_RBX = (ULong)ebx;
   st->guest_RCX = (ULong)ecx;
   st->guest_RDX = (ULong)edx;
}
2643
2644
/* Rotate-through-carry right.  |szIN| is the operand size in bytes
   (1, 2, 4 or 8; anything else panics).  The carry-in is taken from
   the C bit of rflags_in.  If szIN is negative the updated rflags
   (C and O replaced) are returned, otherwise the rotated value is
   returned; the caller cannot have both from one call. */
ULong amd64g_calculate_RCR ( ULong arg,
                             ULong rot_amt,
                             ULong rflags_in,
                             Long  szIN )
{
   Bool  wantRflags = toBool(szIN < 0);
   ULong sz         = wantRflags ? (-szIN) : szIN;
   ULong nbits, msb, lowMask, count, carry, ovf, bit0;

   if (sz != 1 && sz != 2 && sz != 4 && sz != 8)
      vpanic("calculate_RCR(amd64g): invalid size");

   nbits   = 8 * sz;
   msb     = nbits - 1;
   /* Selects the bits below the operand's top bit. */
   lowMask = (1ULL << msb) - 1;

   /* The count is masked to 6 bits for 64-bit operands and 5 bits
      otherwise, then reduced modulo nbits+1 (the operand plus the
      carry bit).  For 4- and 8-byte operands the masked count is
      already below nbits+1, so the reduction changes nothing. */
   count = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   while (count >= nbits + 1)
      count -= nbits + 1;

   carry = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   /* O is derived from the operand and carry before rotating. */
   ovf   = ((arg >> msb) ^ carry) & 1;

   for (; count > 0; count--) {
      bit0  = arg & 1;
      arg   = ((arg >> 1) & lowMask) | (carry << msb);
      carry = bit0;
   }

   carry &= 1;
   ovf   &= 1;
   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   rflags_in |= (carry << AMD64G_CC_SHIFT_C) | (ovf << AMD64G_CC_SHIFT_O);

   return wantRflags ? rflags_in : arg;
}
2712
/* Rotate-through-carry left.  |szIN| is the operand size in bytes
   (1, 2, 4 or 8; anything else panics).  The carry-in is taken from
   the C bit of rflags_in.  If szIN is negative the updated rflags
   (C and O replaced) are returned, otherwise the rotated value is
   returned; the caller cannot have both from one call. */
ULong amd64g_calculate_RCL ( ULong arg,
                             ULong rot_amt,
                             ULong rflags_in,
                             Long  szIN )
{
   Bool  wantRflags = toBool(szIN < 0);
   ULong sz         = wantRflags ? (-szIN) : szIN;
   ULong nbits, msb, valMask, count, carry, ovf, topbit;

   if (sz != 1 && sz != 2 && sz != 4 && sz != 8)
      vpanic("calculate_RCL(amd64g): invalid size");

   nbits   = 8 * sz;
   msb     = nbits - 1;
   /* Selects the low nbits bits (all 64 for an 8-byte operand). */
   valMask = ~0ULL >> (64 - nbits);

   /* The count is masked to 6 bits for 64-bit operands and 5 bits
      otherwise, then reduced modulo nbits+1 (the operand plus the
      carry bit).  For 4- and 8-byte operands the masked count is
      already below nbits+1, so the reduction changes nothing. */
   count = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   while (count >= nbits + 1)
      count -= nbits + 1;

   carry = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;

   for (; count > 0; count--) {
      topbit = (arg >> msb) & 1;
      arg    = valMask & ((arg << 1) | (carry & 1));
      carry  = topbit;
   }

   /* O is derived from the result and carry after rotating. */
   ovf = ((arg >> msb) ^ carry) & 1;

   carry &= 1;
   ovf   &= 1;
   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   rflags_in |= (carry << AMD64G_CC_SHIFT_C) | (ovf << AMD64G_CC_SHIFT_O);

   return wantRflags ? rflags_in : arg;
}
2778
2779 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
2780 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
2781 */
amd64g_calculate_pclmul(ULong a,ULong b,ULong which)2782 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
2783 {
2784 ULong hi, lo, tmp, A[16];
2785
2786 A[0] = 0; A[1] = a;
2787 A[2] = A[1] << 1; A[3] = A[2] ^ a;
2788 A[4] = A[2] << 1; A[5] = A[4] ^ a;
2789 A[6] = A[3] << 1; A[7] = A[6] ^ a;
2790 A[8] = A[4] << 1; A[9] = A[8] ^ a;
2791 A[10] = A[5] << 1; A[11] = A[10] ^ a;
2792 A[12] = A[6] << 1; A[13] = A[12] ^ a;
2793 A[14] = A[7] << 1; A[15] = A[14] ^ a;
2794
2795 lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
2796 hi = lo >> 56;
2797 lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
2798 hi = (hi << 8) | (lo >> 56);
2799 lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
2800 hi = (hi << 8) | (lo >> 56);
2801 lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
2802 hi = (hi << 8) | (lo >> 56);
2803 lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
2804 hi = (hi << 8) | (lo >> 56);
2805 lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
2806 hi = (hi << 8) | (lo >> 56);
2807 lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
2808 hi = (hi << 8) | (lo >> 56);
2809 lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
2810
2811 ULong m0 = -1;
2812 m0 /= 255;
2813 tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
2814 tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
2815 tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
2816 tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
2817 tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
2818 tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
2819 tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
2820
2821 return which ? hi : lo;
2822 }
2823
2824
2825 /* CALLED FROM GENERATED CODE */
2826 /* DIRTY HELPER (non-referentially-transparent) */
2827 /* Horrible hack. On non-amd64 platforms, return 1. */
amd64g_dirtyhelper_RDTSC(void)2828 ULong amd64g_dirtyhelper_RDTSC ( void )
2829 {
2830 # if defined(__x86_64__)
2831 UInt eax, edx;
2832 __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
2833 return (((ULong)edx) << 32) | ((ULong)eax);
2834 # else
2835 return 1ULL;
2836 # endif
2837 }
2838
2839
2840 /* CALLED FROM GENERATED CODE */
2841 /* DIRTY HELPER (non-referentially-transparent) */
2842 /* Horrible hack. On non-amd64 platforms, return 0. */
amd64g_dirtyhelper_IN(ULong portno,ULong sz)2843 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
2844 {
2845 # if defined(__x86_64__)
2846 ULong r = 0;
2847 portno &= 0xFFFF;
2848 switch (sz) {
2849 case 4:
2850 __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
2851 : "=a" (r) : "Nd" (portno));
2852 break;
2853 case 2:
2854 __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
2855 : "=a" (r) : "Nd" (portno));
2856 break;
2857 case 1:
2858 __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
2859 : "=a" (r) : "Nd" (portno));
2860 break;
2861 default:
2862 break; /* note: no 64-bit version of insn exists */
2863 }
2864 return r;
2865 # else
2866 return 0;
2867 # endif
2868 }
2869
2870
2871 /* CALLED FROM GENERATED CODE */
2872 /* DIRTY HELPER (non-referentially-transparent) */
2873 /* Horrible hack. On non-amd64 platforms, do nothing. */
amd64g_dirtyhelper_OUT(ULong portno,ULong data,ULong sz)2874 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
2875 {
2876 # if defined(__x86_64__)
2877 portno &= 0xFFFF;
2878 switch (sz) {
2879 case 4:
2880 __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
2881 : : "a" (data), "Nd" (portno));
2882 break;
2883 case 2:
2884 __asm__ __volatile__("outw %w0, %w1"
2885 : : "a" (data), "Nd" (portno));
2886 break;
2887 case 1:
2888 __asm__ __volatile__("outb %b0, %w1"
2889 : : "a" (data), "Nd" (portno));
2890 break;
2891 default:
2892 break; /* note: no 64-bit version of insn exists */
2893 }
2894 # else
2895 /* do nothing */
2896 # endif
2897 }
2898
2899 /* CALLED FROM GENERATED CODE */
2900 /* DIRTY HELPER (non-referentially-transparent) */
2901 /* Horrible hack. On non-amd64 platforms, do nothing. */
2902 /* op = 0: call the native SGDT instruction.
2903 op = 1: call the native SIDT instruction.
2904 */
amd64g_dirtyhelper_SxDT(void * address,ULong op)2905 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
2906 # if defined(__x86_64__)
2907 switch (op) {
2908 case 0:
2909 __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
2910 break;
2911 case 1:
2912 __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
2913 break;
2914 default:
2915 vpanic("amd64g_dirtyhelper_SxDT");
2916 }
2917 # else
2918 /* do nothing */
2919 UChar* p = (UChar*)address;
2920 p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
2921 p[6] = p[7] = p[8] = p[9] = 0;
2922 # endif
2923 }
2924
2925 /*---------------------------------------------------------------*/
2926 /*--- Helpers for MMX/SSE/SSE2. ---*/
2927 /*---------------------------------------------------------------*/
2928
/* Absolute difference of two unsigned bytes. */
static inline UChar abdU8 ( UChar xx, UChar yy ) {
   if (xx > yy)
      return toUChar(xx - yy);
   return toUChar(yy - xx);
}
2932
mk32x2(UInt w1,UInt w0)2933 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
2934 return (((ULong)w1) << 32) | ((ULong)w0);
2935 }
2936
sel16x4_3(ULong w64)2937 static inline UShort sel16x4_3 ( ULong w64 ) {
2938 UInt hi32 = toUInt(w64 >> 32);
2939 return toUShort(hi32 >> 16);
2940 }
sel16x4_2(ULong w64)2941 static inline UShort sel16x4_2 ( ULong w64 ) {
2942 UInt hi32 = toUInt(w64 >> 32);
2943 return toUShort(hi32);
2944 }
sel16x4_1(ULong w64)2945 static inline UShort sel16x4_1 ( ULong w64 ) {
2946 UInt lo32 = toUInt(w64);
2947 return toUShort(lo32 >> 16);
2948 }
sel16x4_0(ULong w64)2949 static inline UShort sel16x4_0 ( ULong w64 ) {
2950 UInt lo32 = toUInt(w64);
2951 return toUShort(lo32);
2952 }
2953
sel8x8_7(ULong w64)2954 static inline UChar sel8x8_7 ( ULong w64 ) {
2955 UInt hi32 = toUInt(w64 >> 32);
2956 return toUChar(hi32 >> 24);
2957 }
sel8x8_6(ULong w64)2958 static inline UChar sel8x8_6 ( ULong w64 ) {
2959 UInt hi32 = toUInt(w64 >> 32);
2960 return toUChar(hi32 >> 16);
2961 }
sel8x8_5(ULong w64)2962 static inline UChar sel8x8_5 ( ULong w64 ) {
2963 UInt hi32 = toUInt(w64 >> 32);
2964 return toUChar(hi32 >> 8);
2965 }
sel8x8_4(ULong w64)2966 static inline UChar sel8x8_4 ( ULong w64 ) {
2967 UInt hi32 = toUInt(w64 >> 32);
2968 return toUChar(hi32 >> 0);
2969 }
sel8x8_3(ULong w64)2970 static inline UChar sel8x8_3 ( ULong w64 ) {
2971 UInt lo32 = toUInt(w64);
2972 return toUChar(lo32 >> 24);
2973 }
sel8x8_2(ULong w64)2974 static inline UChar sel8x8_2 ( ULong w64 ) {
2975 UInt lo32 = toUInt(w64);
2976 return toUChar(lo32 >> 16);
2977 }
sel8x8_1(ULong w64)2978 static inline UChar sel8x8_1 ( ULong w64 ) {
2979 UInt lo32 = toUInt(w64);
2980 return toUChar(lo32 >> 8);
2981 }
sel8x8_0(ULong w64)2982 static inline UChar sel8x8_0 ( ULong w64 ) {
2983 UInt lo32 = toUInt(w64);
2984 return toUChar(lo32 >> 0);
2985 }
2986
2987 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calculate_mmx_pmaddwd(ULong xx,ULong yy)2988 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
2989 {
2990 return
2991 mk32x2(
2992 (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
2993 + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
2994 (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
2995 + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
2996 );
2997 }
2998
2999 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calculate_mmx_pmovmskb(ULong xx)3000 ULong amd64g_calculate_mmx_pmovmskb ( ULong xx )
3001 {
3002 ULong r = 0;
3003 if (xx & (1ULL << (64-1))) r |= (1<<7);
3004 if (xx & (1ULL << (56-1))) r |= (1<<6);
3005 if (xx & (1ULL << (48-1))) r |= (1<<5);
3006 if (xx & (1ULL << (40-1))) r |= (1<<4);
3007 if (xx & (1ULL << (32-1))) r |= (1<<3);
3008 if (xx & (1ULL << (24-1))) r |= (1<<2);
3009 if (xx & (1ULL << (16-1))) r |= (1<<1);
3010 if (xx & (1ULL << ( 8-1))) r |= (1<<0);
3011 return r;
3012 }
3013
3014 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calculate_mmx_psadbw(ULong xx,ULong yy)3015 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
3016 {
3017 UInt t = 0;
3018 t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
3019 t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
3020 t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
3021 t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
3022 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3023 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3024 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3025 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3026 t &= 0xFFFF;
3027 return (ULong)t;
3028 }
3029
3030 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calculate_sse_pmovmskb(ULong w64hi,ULong w64lo)3031 ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
3032 {
3033 ULong rHi8 = amd64g_calculate_mmx_pmovmskb ( w64hi );
3034 ULong rLo8 = amd64g_calculate_mmx_pmovmskb ( w64lo );
3035 return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF);
3036 }
3037
3038 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calculate_sse_phminposuw(ULong sLo,ULong sHi)3039 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
3040 {
3041 UShort t, min;
3042 UInt idx;
3043 t = sel16x4_0(sLo); if (True) { min = t; idx = 0; }
3044 t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
3045 t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
3046 t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
3047 t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
3048 t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
3049 t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
3050 t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
3051 return ((ULong)(idx << 16)) | ((ULong)min);
3052 }
3053
3054 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calc_crc32b(ULong crcIn,ULong b)3055 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
3056 {
3057 UInt i;
3058 ULong crc = (b & 0xFFULL) ^ crcIn;
3059 for (i = 0; i < 8; i++)
3060 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3061 return crc;
3062 }
3063
3064 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calc_crc32w(ULong crcIn,ULong w)3065 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
3066 {
3067 UInt i;
3068 ULong crc = (w & 0xFFFFULL) ^ crcIn;
3069 for (i = 0; i < 16; i++)
3070 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3071 return crc;
3072 }
3073
3074 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calc_crc32l(ULong crcIn,ULong l)3075 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
3076 {
3077 UInt i;
3078 ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
3079 for (i = 0; i < 32; i++)
3080 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3081 return crc;
3082 }
3083
3084 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calc_crc32q(ULong crcIn,ULong q)3085 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
3086 {
3087 ULong crc = amd64g_calc_crc32l(crcIn, q);
3088 return amd64g_calc_crc32l(crc, q >> 32);
3089 }
3090
3091
3092 /* .. helper for next fn .. */
sad_8x4(ULong xx,ULong yy)3093 static inline ULong sad_8x4 ( ULong xx, ULong yy )
3094 {
3095 UInt t = 0;
3096 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3097 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3098 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3099 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3100 return (ULong)t;
3101 }
3102
3103 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calc_mpsadbw(ULong sHi,ULong sLo,ULong dHi,ULong dLo,ULong imm_and_return_control_bit)3104 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
3105 ULong dHi, ULong dLo,
3106 ULong imm_and_return_control_bit )
3107 {
3108 UInt imm8 = imm_and_return_control_bit & 7;
3109 Bool calcHi = (imm_and_return_control_bit >> 7) & 1;
3110 UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
3111 UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
3112 /* For src we only need 32 bits, so get them into the
3113 lower half of a 64 bit word. */
3114 ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
3115 /* For dst we need to get hold of 56 bits (7 bytes) from a total of
3116 11 bytes. If calculating the low part of the result, need bytes
3117 dstOffsL * 4 + (0 .. 6); if calculating the high part,
3118 dstOffsL * 4 + (4 .. 10). */
3119 ULong dst;
3120 /* dstOffL = 0, Lo -> 0 .. 6
3121 dstOffL = 1, Lo -> 4 .. 10
3122 dstOffL = 0, Hi -> 4 .. 10
3123 dstOffL = 1, Hi -> 8 .. 14
3124 */
3125 if (calcHi && dstOffsL) {
3126 /* 8 .. 14 */
3127 dst = dHi & 0x00FFFFFFFFFFFFFFULL;
3128 }
3129 else if (!calcHi && !dstOffsL) {
3130 /* 0 .. 6 */
3131 dst = dLo & 0x00FFFFFFFFFFFFFFULL;
3132 }
3133 else {
3134 /* 4 .. 10 */
3135 dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
3136 }
3137 ULong r0 = sad_8x4( dst >> 0, src );
3138 ULong r1 = sad_8x4( dst >> 8, src );
3139 ULong r2 = sad_8x4( dst >> 16, src );
3140 ULong r3 = sad_8x4( dst >> 24, src );
3141 ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
3142 return res;
3143 }
3144
3145 /*---------------------------------------------------------------*/
3146 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
3147 /*---------------------------------------------------------------*/
3148
zmask_from_V128(V128 * arg)3149 static UInt zmask_from_V128 ( V128* arg )
3150 {
3151 UInt i, res = 0;
3152 for (i = 0; i < 16; i++) {
3153 res |= ((arg->w8[i] == 0) ? 1 : 0) << i;
3154 }
3155 return res;
3156 }
3157
zmask_from_V128_wide(V128 * arg)3158 static UInt zmask_from_V128_wide ( V128* arg )
3159 {
3160 UInt i, res = 0;
3161 for (i = 0; i < 8; i++) {
3162 res |= ((arg->w16[i] == 0) ? 1 : 0) << i;
3163 }
3164 return res;
3165 }
3166
3167 /* Helps with PCMP{I,E}STR{I,M}.
3168
3169 CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really,
3170 actually it could be a clean helper, but for the fact that we can't
3171 pass by value 2 x V128 to a clean helper, nor have one returned.)
3172 Reads guest state, writes to guest state for the xSTRM cases, no
3173 accesses of memory, is a pure function.
3174
3175 opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
3176 the callee knows which I/E and I/M variant it is dealing with and
3177 what the specific operation is. 4th byte of opcode is in the range
3178 0x60 to 0x63:
3179 istri 66 0F 3A 63
3180 istrm 66 0F 3A 62
3181 estri 66 0F 3A 61
3182 estrm 66 0F 3A 60
3183
3184 gstOffL and gstOffR are the guest state offsets for the two XMM
3185 register inputs. We never have to deal with the memory case since
3186 that is handled by pre-loading the relevant value into the fake
3187 XMM16 register.
3188
3189 For ESTRx variants, edxIN and eaxIN hold the values of those two
3190 registers.
3191
3192 In all cases, the bottom 16 bits of the result contain the new
3193 OSZACP %rflags values. For xSTRI variants, bits[31:16] of the
3194 result hold the new %ecx value. For xSTRM variants, the helper
3195 writes the result directly to the guest XMM0.
3196
3197 Declarable side effects: in all cases, reads guest state at
3198 [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes
3199 guest_XMM0.
3200
3201 Is expected to be called with opc_and_imm combinations which have
3202 actually been validated, and will assert if otherwise. The front
3203 end should ensure we're only called with verified values.
3204 */
amd64g_dirtyhelper_PCMPxSTRx(VexGuestAMD64State * gst,HWord opc4_and_imm,HWord gstOffL,HWord gstOffR,HWord edxIN,HWord eaxIN)3205 ULong amd64g_dirtyhelper_PCMPxSTRx (
3206 VexGuestAMD64State* gst,
3207 HWord opc4_and_imm,
3208 HWord gstOffL, HWord gstOffR,
3209 HWord edxIN, HWord eaxIN
3210 )
3211 {
3212 HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
3213 HWord imm8 = opc4_and_imm & 0xFF;
3214 HWord isISTRx = opc4 & 2;
3215 HWord isxSTRM = (opc4 & 1) ^ 1;
3216 vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
3217 HWord wide = (imm8 & 1);
3218
3219 // where the args are
3220 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3221 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3222
3223 /* Create the arg validity masks, either from the vectors
3224 themselves or from the supplied edx/eax values. */
3225 // FIXME: this is only right for the 8-bit data cases.
3226 // At least that is asserted above.
3227 UInt zmaskL, zmaskR;
3228
3229 // temp spot for the resulting flags and vector.
3230 V128 resV;
3231 UInt resOSZACP;
3232
3233 // for checking whether case was handled
3234 Bool ok = False;
3235
3236 if (wide) {
3237 if (isISTRx) {
3238 zmaskL = zmask_from_V128_wide(argL);
3239 zmaskR = zmask_from_V128_wide(argR);
3240 } else {
3241 Int tmp;
3242 tmp = edxIN & 0xFFFFFFFF;
3243 if (tmp < -8) tmp = -8;
3244 if (tmp > 8) tmp = 8;
3245 if (tmp < 0) tmp = -tmp;
3246 vassert(tmp >= 0 && tmp <= 8);
3247 zmaskL = (1 << tmp) & 0xFF;
3248 tmp = eaxIN & 0xFFFFFFFF;
3249 if (tmp < -8) tmp = -8;
3250 if (tmp > 8) tmp = 8;
3251 if (tmp < 0) tmp = -tmp;
3252 vassert(tmp >= 0 && tmp <= 8);
3253 zmaskR = (1 << tmp) & 0xFF;
3254 }
3255 // do the meyaath
3256 ok = compute_PCMPxSTRx_wide (
3257 &resV, &resOSZACP, argL, argR,
3258 zmaskL, zmaskR, imm8, (Bool)isxSTRM
3259 );
3260 } else {
3261 if (isISTRx) {
3262 zmaskL = zmask_from_V128(argL);
3263 zmaskR = zmask_from_V128(argR);
3264 } else {
3265 Int tmp;
3266 tmp = edxIN & 0xFFFFFFFF;
3267 if (tmp < -16) tmp = -16;
3268 if (tmp > 16) tmp = 16;
3269 if (tmp < 0) tmp = -tmp;
3270 vassert(tmp >= 0 && tmp <= 16);
3271 zmaskL = (1 << tmp) & 0xFFFF;
3272 tmp = eaxIN & 0xFFFFFFFF;
3273 if (tmp < -16) tmp = -16;
3274 if (tmp > 16) tmp = 16;
3275 if (tmp < 0) tmp = -tmp;
3276 vassert(tmp >= 0 && tmp <= 16);
3277 zmaskR = (1 << tmp) & 0xFFFF;
3278 }
3279 // do the meyaath
3280 ok = compute_PCMPxSTRx (
3281 &resV, &resOSZACP, argL, argR,
3282 zmaskL, zmaskR, imm8, (Bool)isxSTRM
3283 );
3284 }
3285
3286 // front end shouldn't pass us any imm8 variants we can't
3287 // handle. Hence:
3288 vassert(ok);
3289
3290 // So, finally we need to get the results back to the caller.
3291 // In all cases, the new OSZACP value is the lowest 16 of
3292 // the return value.
3293 if (isxSTRM) {
3294 gst->guest_YMM0[0] = resV.w32[0];
3295 gst->guest_YMM0[1] = resV.w32[1];
3296 gst->guest_YMM0[2] = resV.w32[2];
3297 gst->guest_YMM0[3] = resV.w32[3];
3298 return resOSZACP & 0x8D5;
3299 } else {
3300 UInt newECX = resV.w32[0] & 0xFFFF;
3301 return (newECX << 16) | (resOSZACP & 0x8D5);
3302 }
3303 }
3304
3305 /*---------------------------------------------------------------*/
3306 /*--- AES primitives and helpers ---*/
3307 /*---------------------------------------------------------------*/
3308 /* a 16 x 16 matrix */
3309 static const UChar sbox[256] = { // row nr
3310 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
3311 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
3312 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
3313 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
3314 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
3315 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
3316 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
3317 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
3318 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
3319 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
3320 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
3321 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
3322 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
3323 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
3324 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
3325 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
3326 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
3327 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
3328 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
3329 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
3330 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
3331 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
3332 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
3333 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
3334 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
3335 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
3336 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
3337 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
3338 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
3339 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
3340 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
3341 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
3342 };
SubBytes(V128 * v)3343 static void SubBytes (V128* v)
3344 {
3345 V128 r;
3346 UInt i;
3347 for (i = 0; i < 16; i++)
3348 r.w8[i] = sbox[v->w8[i]];
3349 *v = r;
3350 }
3351
3352 /* a 16 x 16 matrix */
3353 static const UChar invsbox[256] = { // row nr
3354 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
3355 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
3356 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
3357 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
3358 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
3359 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
3360 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
3361 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
3362 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
3363 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
3364 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
3365 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
3366 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
3367 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
3368 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
3369 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
3370 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
3371 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
3372 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
3373 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
3374 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
3375 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
3376 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
3377 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
3378 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
3379 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
3380 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
3381 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
3382 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
3383 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
3384 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
3385 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
3386 };
InvSubBytes(V128 * v)3387 static void InvSubBytes (V128* v)
3388 {
3389 V128 r;
3390 UInt i;
3391 for (i = 0; i < 16; i++)
3392 r.w8[i] = invsbox[v->w8[i]];
3393 *v = r;
3394 }
3395
3396 static const UChar ShiftRows_op[16] =
3397 {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
ShiftRows(V128 * v)3398 static void ShiftRows (V128* v)
3399 {
3400 V128 r;
3401 UInt i;
3402 for (i = 0; i < 16; i++)
3403 r.w8[i] = v->w8[ShiftRows_op[15-i]];
3404 *v = r;
3405 }
3406
3407 static const UChar InvShiftRows_op[16] =
3408 {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
InvShiftRows(V128 * v)3409 static void InvShiftRows (V128* v)
3410 {
3411 V128 r;
3412 UInt i;
3413 for (i = 0; i < 16; i++)
3414 r.w8[i] = v->w8[InvShiftRows_op[15-i]];
3415 *v = r;
3416 }
3417
3418 /* Multiplication of the finite fields elements of AES.
3419 See "A Specification for The AES Algorithm Rijndael
3420 (by Joan Daemen & Vincent Rijmen)"
3421 Dr. Brian Gladman, v3.1, 3rd March 2001. */
3422 /* N values so that (hex) xy = 0x03^N.
3423 0x00 cannot be used. We put 0xff for this value.*/
3424 /* a 16 x 16 matrix */
3425 static const UChar Nxy[256] = { // row nr
3426 0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
3427 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
3428 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
3429 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
3430 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
3431 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
3432 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
3433 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
3434 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
3435 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
3436 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
3437 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
3438 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
3439 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
3440 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
3441 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
3442 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
3443 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
3444 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
3445 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
3446 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
3447 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
3448 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
3449 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
3450 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
3451 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
3452 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
3453 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
3454 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
3455 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
3456 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
3457 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
3458 };
3459
3460 /* E values so that E = 0x03^xy. */
3461 static const UChar Exy[256] = { // row nr
3462 0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
3463 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
3464 0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
3465 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
3466 0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
3467 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
3468 0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
3469 0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
3470 0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
3471 0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
3472 0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
3473 0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
3474 0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
3475 0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
3476 0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
3477 0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
3478 0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
3479 0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
3480 0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
3481 0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
3482 0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
3483 0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
3484 0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
3485 0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
3486 0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
3487 0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
3488 0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
3489 0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
3490 0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
3491 0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
3492 0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
3493 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
3494
ff_mul(UChar u1,UChar u2)3495 static inline UChar ff_mul(UChar u1, UChar u2)
3496 {
3497 if ((u1 > 0) && (u2 > 0)) {
3498 UInt ui = Nxy[u1] + Nxy[u2];
3499 if (ui >= 255)
3500 ui = ui - 255;
3501 return Exy[ui];
3502 } else {
3503 return 0;
3504 };
3505 }
3506
/* Apply the MixColumns transform to *v.  Each group of four
   consecutive bytes is multiplied (in the AES field, via ff_mul) by
   the circulant matrix whose first row is 02 03 01 01. */
static void MixColumns (V128* v)
{
   V128 out;
   Int  grp;
   for (grp = 0; grp < 4; grp++) {
      const UChar a0 = v->w8[grp * 4 + 0];
      const UChar a1 = v->w8[grp * 4 + 1];
      const UChar a2 = v->w8[grp * 4 + 2];
      const UChar a3 = v->w8[grp * 4 + 3];

      out.w8[grp * 4 + 0] = ff_mul(0x02, a0) ^ ff_mul(0x03, a1) ^ a2 ^ a3;
      out.w8[grp * 4 + 1] = a0 ^ ff_mul(0x02, a1) ^ ff_mul(0x03, a2) ^ a3;
      out.w8[grp * 4 + 2] = a0 ^ a1 ^ ff_mul(0x02, a2) ^ ff_mul(0x03, a3);
      out.w8[grp * 4 + 3] = ff_mul(0x03, a0) ^ a1 ^ a2 ^ ff_mul(0x02, a3);
   }
   *v = out;
}
3525
/* Apply the InvMixColumns transform to *v.  Each group of four
   consecutive bytes is multiplied (in the AES field, via ff_mul) by
   the circulant matrix whose first row is 0e 0b 0d 09. */
static void InvMixColumns (V128* v)
{
   V128 out;
   Int  grp;
   for (grp = 0; grp < 4; grp++) {
      const UChar a0 = v->w8[grp * 4 + 0];
      const UChar a1 = v->w8[grp * 4 + 1];
      const UChar a2 = v->w8[grp * 4 + 2];
      const UChar a3 = v->w8[grp * 4 + 3];

      out.w8[grp * 4 + 0] = ff_mul(0x0e, a0) ^ ff_mul(0x0b, a1)
                            ^ ff_mul(0x0d, a2) ^ ff_mul(0x09, a3);
      out.w8[grp * 4 + 1] = ff_mul(0x09, a0) ^ ff_mul(0x0e, a1)
                            ^ ff_mul(0x0b, a2) ^ ff_mul(0x0d, a3);
      out.w8[grp * 4 + 2] = ff_mul(0x0d, a0) ^ ff_mul(0x09, a1)
                            ^ ff_mul(0x0e, a2) ^ ff_mul(0x0b, a3);
      out.w8[grp * 4 + 3] = ff_mul(0x0b, a0) ^ ff_mul(0x0d, a1)
                            ^ ff_mul(0x09, a2) ^ ff_mul(0x0e, a3);
   }
   *v = out;
}
3545
3546 /* For description, see definition in guest_amd64_defs.h */
amd64g_dirtyhelper_AES(VexGuestAMD64State * gst,HWord opc4,HWord gstOffD,HWord gstOffL,HWord gstOffR)3547 void amd64g_dirtyhelper_AES (
3548 VexGuestAMD64State* gst,
3549 HWord opc4, HWord gstOffD,
3550 HWord gstOffL, HWord gstOffR
3551 )
3552 {
3553 // where the args are
3554 V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
3555 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3556 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3557 V128 r;
3558
3559 switch (opc4) {
3560 case 0xDC: /* AESENC */
3561 case 0xDD: /* AESENCLAST */
3562 r = *argR;
3563 ShiftRows (&r);
3564 SubBytes (&r);
3565 if (opc4 == 0xDC)
3566 MixColumns (&r);
3567 argD->w64[0] = r.w64[0] ^ argL->w64[0];
3568 argD->w64[1] = r.w64[1] ^ argL->w64[1];
3569 break;
3570
3571 case 0xDE: /* AESDEC */
3572 case 0xDF: /* AESDECLAST */
3573 r = *argR;
3574 InvShiftRows (&r);
3575 InvSubBytes (&r);
3576 if (opc4 == 0xDE)
3577 InvMixColumns (&r);
3578 argD->w64[0] = r.w64[0] ^ argL->w64[0];
3579 argD->w64[1] = r.w64[1] ^ argL->w64[1];
3580 break;
3581
3582 case 0xDB: /* AESIMC */
3583 *argD = *argL;
3584 InvMixColumns (argD);
3585 break;
3586 default: vassert(0);
3587 }
3588 }
3589
/* Rotate a 32-bit word right by 8 bits. */
static inline UInt RotWord (UInt w32)
{
   UInt shiftedDown = w32 >> 8;
   UInt wrapped     = w32 << 24;
   return shiftedDown | wrapped;
}
3594
/* Apply the sbox substitution to each of the four bytes of w32,
   leaving every byte in its original position. */
static inline UInt SubWord (UInt w32)
{
   UInt b0 = sbox[(w32 >>  0) & 0xFF];
   UInt b1 = sbox[(w32 >>  8) & 0xFF];
   UInt b2 = sbox[(w32 >> 16) & 0xFF];
   UInt b3 = sbox[(w32 >> 24) & 0xFF];
   return (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
}
3608
3609 /* For description, see definition in guest_amd64_defs.h */
amd64g_dirtyhelper_AESKEYGENASSIST(VexGuestAMD64State * gst,HWord imm8,HWord gstOffL,HWord gstOffR)3610 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
3611 VexGuestAMD64State* gst,
3612 HWord imm8,
3613 HWord gstOffL, HWord gstOffR
3614 )
3615 {
3616 // where the args are
3617 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3618 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3619
3620 argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
3621 argR->w32[2] = SubWord (argL->w32[3]);
3622 argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
3623 argR->w32[0] = SubWord (argL->w32[1]);
3624 }
3625
3626
3627
3628 /*---------------------------------------------------------------*/
3629 /*--- Helpers for dealing with, and describing, ---*/
3630 /*--- guest state as a whole. ---*/
3631 /*---------------------------------------------------------------*/
3632
3633 /* Initialise the entire amd64 guest state. */
3634 /* VISIBLE TO LIBVEX CLIENT */
LibVEX_GuestAMD64_initialise(VexGuestAMD64State * vex_state)3635 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
3636 {
3637 vex_state->host_EvC_FAILADDR = 0;
3638 vex_state->host_EvC_COUNTER = 0;
3639 vex_state->pad0 = 0;
3640
3641 vex_state->guest_RAX = 0;
3642 vex_state->guest_RCX = 0;
3643 vex_state->guest_RDX = 0;
3644 vex_state->guest_RBX = 0;
3645 vex_state->guest_RSP = 0;
3646 vex_state->guest_RBP = 0;
3647 vex_state->guest_RSI = 0;
3648 vex_state->guest_RDI = 0;
3649 vex_state->guest_R8 = 0;
3650 vex_state->guest_R9 = 0;
3651 vex_state->guest_R10 = 0;
3652 vex_state->guest_R11 = 0;
3653 vex_state->guest_R12 = 0;
3654 vex_state->guest_R13 = 0;
3655 vex_state->guest_R14 = 0;
3656 vex_state->guest_R15 = 0;
3657
3658 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
3659 vex_state->guest_CC_DEP1 = 0;
3660 vex_state->guest_CC_DEP2 = 0;
3661 vex_state->guest_CC_NDEP = 0;
3662
3663 vex_state->guest_DFLAG = 1; /* forwards */
3664 vex_state->guest_IDFLAG = 0;
3665
3666 /* HACK: represent the offset associated with %fs==0. This
3667 assumes that %fs is only ever zero. */
3668 vex_state->guest_FS_ZERO = 0;
3669
3670 vex_state->guest_RIP = 0;
3671
3672 /* Initialise the simulated FPU */
3673 amd64g_dirtyhelper_FINIT( vex_state );
3674
3675 /* Initialise the AVX state. */
3676 # define AVXZERO(_ymm) \
3677 do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
3678 _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
3679 } while (0)
3680 vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
3681 AVXZERO(vex_state->guest_YMM0);
3682 AVXZERO(vex_state->guest_YMM1);
3683 AVXZERO(vex_state->guest_YMM2);
3684 AVXZERO(vex_state->guest_YMM3);
3685 AVXZERO(vex_state->guest_YMM4);
3686 AVXZERO(vex_state->guest_YMM5);
3687 AVXZERO(vex_state->guest_YMM6);
3688 AVXZERO(vex_state->guest_YMM7);
3689 AVXZERO(vex_state->guest_YMM8);
3690 AVXZERO(vex_state->guest_YMM9);
3691 AVXZERO(vex_state->guest_YMM10);
3692 AVXZERO(vex_state->guest_YMM11);
3693 AVXZERO(vex_state->guest_YMM12);
3694 AVXZERO(vex_state->guest_YMM13);
3695 AVXZERO(vex_state->guest_YMM14);
3696 AVXZERO(vex_state->guest_YMM15);
3697 AVXZERO(vex_state->guest_YMM16);
3698
3699 # undef AVXZERO
3700
3701 vex_state->guest_EMWARN = EmWarn_NONE;
3702
3703 /* These should not ever be either read or written, but we
3704 initialise them anyway. */
3705 vex_state->guest_TISTART = 0;
3706 vex_state->guest_TILEN = 0;
3707
3708 vex_state->guest_NRADDR = 0;
3709 vex_state->guest_SC_CLASS = 0;
3710 vex_state->guest_GS_0x60 = 0;
3711
3712 vex_state->guest_IP_AT_SYSCALL = 0;
3713 vex_state->pad1 = 0;
3714 }
3715
3716
3717 /* Figure out if any part of the guest state contained in minoff
3718 .. maxoff requires precise memory exceptions. If in doubt return
3719 True (but this is generates significantly slower code).
3720
3721 By default we enforce precise exns for guest %RSP, %RBP and %RIP
3722 only. These are the minimum needed to extract correct stack
3723 backtraces from amd64 code.
3724 */
guest_amd64_state_requires_precise_mem_exns(Int minoff,Int maxoff)3725 Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
3726 Int maxoff)
3727 {
3728 Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
3729 Int rbp_max = rbp_min + 8 - 1;
3730 Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
3731 Int rsp_max = rsp_min + 8 - 1;
3732 Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
3733 Int rip_max = rip_min + 8 - 1;
3734
3735 if (maxoff < rbp_min || minoff > rbp_max) {
3736 /* no overlap with rbp */
3737 } else {
3738 return True;
3739 }
3740
3741 if (maxoff < rsp_min || minoff > rsp_max) {
3742 /* no overlap with rsp */
3743 } else {
3744 return True;
3745 }
3746
3747 if (maxoff < rip_min || minoff > rip_max) {
3748 /* no overlap with eip */
3749 } else {
3750 return True;
3751 }
3752
3753 return False;
3754 }
3755
3756
3757 #define ALWAYSDEFD(field) \
3758 { offsetof(VexGuestAMD64State, field), \
3759 (sizeof ((VexGuestAMD64State*)0)->field) }
3760
3761 VexGuestLayout
3762 amd64guest_layout
3763 = {
3764 /* Total size of the guest state, in bytes. */
3765 .total_sizeB = sizeof(VexGuestAMD64State),
3766
3767 /* Describe the stack pointer. */
3768 .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
3769 .sizeof_SP = 8,
3770
3771 /* Describe the frame pointer. */
3772 .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
3773 .sizeof_FP = 8,
3774
3775 /* Describe the instruction pointer. */
3776 .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
3777 .sizeof_IP = 8,
3778
3779 /* Describe any sections to be regarded by Memcheck as
3780 'always-defined'. */
3781 .n_alwaysDefd = 16,
3782
3783 /* flags thunk: OP and NDEP are always defd, whereas DEP1
3784 and DEP2 have to be tracked. See detailed comment in
3785 gdefs.h on meaning of thunk fields. */
3786 .alwaysDefd
3787 = { /* 0 */ ALWAYSDEFD(guest_CC_OP),
3788 /* 1 */ ALWAYSDEFD(guest_CC_NDEP),
3789 /* 2 */ ALWAYSDEFD(guest_DFLAG),
3790 /* 3 */ ALWAYSDEFD(guest_IDFLAG),
3791 /* 4 */ ALWAYSDEFD(guest_RIP),
3792 /* 5 */ ALWAYSDEFD(guest_FS_ZERO),
3793 /* 6 */ ALWAYSDEFD(guest_FTOP),
3794 /* 7 */ ALWAYSDEFD(guest_FPTAG),
3795 /* 8 */ ALWAYSDEFD(guest_FPROUND),
3796 /* 9 */ ALWAYSDEFD(guest_FC3210),
3797 // /* */ ALWAYSDEFD(guest_CS),
3798 // /* */ ALWAYSDEFD(guest_DS),
3799 // /* */ ALWAYSDEFD(guest_ES),
3800 // /* */ ALWAYSDEFD(guest_FS),
3801 // /* */ ALWAYSDEFD(guest_GS),
3802 // /* */ ALWAYSDEFD(guest_SS),
3803 // /* */ ALWAYSDEFD(guest_LDT),
3804 // /* */ ALWAYSDEFD(guest_GDT),
3805 /* 10 */ ALWAYSDEFD(guest_EMWARN),
3806 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
3807 /* 12 */ ALWAYSDEFD(guest_TISTART),
3808 /* 13 */ ALWAYSDEFD(guest_TILEN),
3809 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
3810 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
3811 }
3812 };
3813
3814
3815 /*---------------------------------------------------------------*/
3816 /*--- end guest_amd64_helpers.c ---*/
3817 /*---------------------------------------------------------------*/
3818