1
2 /*---------------------------------------------------------------*/
3 /*--- begin guest_amd64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
5
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
9
10 Copyright (C) 2004-2013 OpenWorks LLP
11 info@open-works.net
12
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
27
28 The GNU General Public License is contained in the file COPYING.
29
30 Neither the names of the U.S. Department of Energy nor the
31 University of California nor the names of its contributors may be
32 used to endorse or promote products derived from this software
33 without prior written permission.
34 */
35
36 #include "libvex_basictypes.h"
37 #include "libvex_emnote.h"
38 #include "libvex_guest_amd64.h"
39 #include "libvex_ir.h"
40 #include "libvex.h"
41
42 #include "main_util.h"
43 #include "main_globals.h"
44 #include "guest_generic_bb_to_IR.h"
45 #include "guest_amd64_defs.h"
46 #include "guest_generic_x87.h"
47
48
49 /* This file contains helper functions for amd64 guest code.
50 Calls to these functions are generated by the back end.
51 These calls are of course in the host machine code and
52 this file will be compiled to host machine code, so that
53 all makes sense.
54
55 Only change the signatures of these helper functions very
56 carefully. If you change the signature here, you'll have to change
57 the parameters passed to it in the IR calls constructed by
58 guest-amd64/toIR.c.
59
60 The convention used is that all functions called from generated
61 code are named amd64g_<something>, and any function whose name lacks
62 that prefix is not called from generated code. Note that some
63 LibVEX_* functions can however be called by VEX's client, but that
64 is not the same as calling them from VEX-generated code.
65 */
66
67
68 /* Set to 1 to get detailed profiling info about use of the flag
69 machinery. */
70 #define PROFILE_RFLAGS 0
71
72
73 /*---------------------------------------------------------------*/
74 /*--- %rflags run-time helpers. ---*/
75 /*---------------------------------------------------------------*/
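/* Background: the guest state does not hold %rflags directly.  It
   keeps a four-field "thunk" -- CC_OP, CC_DEP1, CC_DEP2 and CC_NDEP,
   whose encodings live in guest_amd64_defs.h -- describing the most
   recent flag-setting operation, and the helpers below reconstruct
   the flags from that thunk on demand.  For example, after an addq
   the front end records CC_OP = AMD64G_CC_OP_ADDQ with the two
   operands in CC_DEP1/CC_DEP2, and ACTIONS_ADD below recomputes all
   six flags from those values when they are actually needed.  The
   D, ID and AC flags are tracked in separate guest fields (see
   LibVEX_GuestAMD64_get_rflags). */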
76
77 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
78 after imulq/mulq. */
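/* Both use the usual schoolbook decomposition into 32-bit limbs:
   writing u = 2^32*u1 + u0 and v = 2^32*v1 + v0, the full product is
   2^64*u1*v1 + 2^32*(u1*v0 + u0*v1) + u0*v0.  The code folds the
   carries out of the two middle partial products into the high 64
   bits; the low 64 bits are simply u*v computed directly. */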
79
static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
81 {
82 ULong u0, v0, w0;
83 Long u1, v1, w1, w2, t;
84 u0 = u & 0xFFFFFFFFULL;
85 u1 = u >> 32;
86 v0 = v & 0xFFFFFFFFULL;
87 v1 = v >> 32;
88 w0 = u0 * v0;
89 t = u1 * v0 + (w0 >> 32);
90 w1 = t & 0xFFFFFFFFULL;
91 w2 = t >> 32;
92 w1 = u0 * v1 + w1;
93 *rHi = u1 * v1 + w2 + (w1 >> 32);
94 *rLo = u * v;
95 }
96
static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
98 {
99 ULong u0, v0, w0;
100 ULong u1, v1, w1,w2,t;
101 u0 = u & 0xFFFFFFFFULL;
102 u1 = u >> 32;
103 v0 = v & 0xFFFFFFFFULL;
104 v1 = v >> 32;
105 w0 = u0 * v0;
106 t = u1 * v0 + (w0 >> 32);
107 w1 = t & 0xFFFFFFFFULL;
108 w2 = t >> 32;
109 w1 = u0 * v1 + w1;
110 *rHi = u1 * v1 + w2 + (w1 >> 32);
111 *rLo = u * v;
112 }
113
114
115 static const UChar parity_table[256] = {
116 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
117 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
118 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
119 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
120 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
121 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
123 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
124 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
125 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
126 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
127 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
128 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
129 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
131 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
132 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
133 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
135 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
136 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
137 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
139 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
140 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
141 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
142 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
143 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
144 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
145 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
147 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
148 };
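/* PF is set when the low 8 bits of the result have even parity.  For
   example parity_table[0x03] (two 1-bits) is AMD64G_CC_MASK_P, while
   parity_table[0x07] (three 1-bits) is 0. */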
149
150 /* generalised left-shifter */
static inline Long lshift ( Long x, Int n )
152 {
153 if (n >= 0)
154 return x << n;
155 else
156 return x >> (-n);
157 }
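/* A negative count shifts right instead.  The flag macros below rely
   on this: lshift(res, 8 - DATA_BITS) moves the sign bit of a 16-,
   32- or 64-bit result down to bit 7, where it is then masked with
   0x80.  The right shift is done on a signed value, which is
   acceptable here because only low-order bits survive the
   subsequent masking. */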
158
159 /* identity on ULong */
static inline ULong idULong ( ULong x )
161 {
162 return x;
163 }
164
165
166 #define PREAMBLE(__data_bits) \
167 /* const */ ULong DATA_MASK \
168 = __data_bits==8 \
169 ? 0xFFULL \
170 : (__data_bits==16 \
171 ? 0xFFFFULL \
172 : (__data_bits==32 \
173 ? 0xFFFFFFFFULL \
174 : 0xFFFFFFFFFFFFFFFFULL)); \
175 /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1); \
176 /* const */ ULong CC_DEP1 = cc_dep1_formal; \
177 /* const */ ULong CC_DEP2 = cc_dep2_formal; \
178 /* const */ ULong CC_NDEP = cc_ndep_formal; \
179 /* Four bogus assignments, which hopefully gcc can */ \
180 /* optimise away, and which stop it complaining about */ \
181 /* unused variables. */ \
182 SIGN_MASK = SIGN_MASK; \
183 DATA_MASK = DATA_MASK; \
184 CC_DEP2 = CC_DEP2; \
185 CC_NDEP = CC_NDEP;
186
187
188 /*-------------------------------------------------------------*/
189
190 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \
191 { \
192 PREAMBLE(DATA_BITS); \
193 { Long cf, pf, af, zf, sf, of; \
194 Long argL, argR, res; \
195 argL = CC_DEP1; \
196 argR = CC_DEP2; \
197 res = argL + argR; \
198 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
199 pf = parity_table[(UChar)res]; \
200 af = (res ^ argL ^ argR) & 0x10; \
201 zf = ((DATA_UTYPE)res == 0) << 6; \
202 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
203 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
204 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
205 return cf | pf | af | zf | sf | of; \
206 } \
207 }
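/* Worked example for ACTIONS_ADD(8, UChar): adding 0x7F and 0x01
   gives res = 0x80, so CF = 0 (no unsigned wraparound), PF = 0 (0x80
   has a single 1-bit), AF = 1 (carry out of bit 3), ZF = 0, SF = 1
   (bit 7 set) and OF = 1 (the operands have the same sign but the
   result does not) -- the same flags real hardware produces for
   "addb $1, %al" with %al = 0x7F. */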
208
209 /*-------------------------------------------------------------*/
210
211 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \
212 { \
213 PREAMBLE(DATA_BITS); \
214 { Long cf, pf, af, zf, sf, of; \
215 Long argL, argR, res; \
216 argL = CC_DEP1; \
217 argR = CC_DEP2; \
218 res = argL - argR; \
219 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
220 pf = parity_table[(UChar)res]; \
221 af = (res ^ argL ^ argR) & 0x10; \
222 zf = ((DATA_UTYPE)res == 0) << 6; \
223 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
224 of = lshift((argL ^ argR) & (argL ^ res), \
225 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
226 return cf | pf | af | zf | sf | of; \
227 } \
228 }
229
230 /*-------------------------------------------------------------*/
231
232 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \
233 { \
234 PREAMBLE(DATA_BITS); \
235 { Long cf, pf, af, zf, sf, of; \
236 Long argL, argR, oldC, res; \
237 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
238 argL = CC_DEP1; \
239 argR = CC_DEP2 ^ oldC; \
240 res = (argL + argR) + oldC; \
241 if (oldC) \
242 cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
243 else \
244 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
245 pf = parity_table[(UChar)res]; \
246 af = (res ^ argL ^ argR) & 0x10; \
247 zf = ((DATA_UTYPE)res == 0) << 6; \
248 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
249 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
250 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
251 return cf | pf | af | zf | sf | of; \
252 } \
253 }
254
255 /*-------------------------------------------------------------*/
256
257 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \
258 { \
259 PREAMBLE(DATA_BITS); \
260 { Long cf, pf, af, zf, sf, of; \
261 Long argL, argR, oldC, res; \
262 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
263 argL = CC_DEP1; \
264 argR = CC_DEP2 ^ oldC; \
265 res = (argL - argR) - oldC; \
266 if (oldC) \
267 cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \
268 else \
269 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
270 pf = parity_table[(UChar)res]; \
271 af = (res ^ argL ^ argR) & 0x10; \
272 zf = ((DATA_UTYPE)res == 0) << 6; \
273 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
274 of = lshift((argL ^ argR) & (argL ^ res), \
275 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
276 return cf | pf | af | zf | sf | of; \
277 } \
278 }
279
280 /*-------------------------------------------------------------*/
281
282 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \
283 { \
284 PREAMBLE(DATA_BITS); \
285 { Long cf, pf, af, zf, sf, of; \
286 cf = 0; \
287 pf = parity_table[(UChar)CC_DEP1]; \
288 af = 0; \
289 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
290 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
291 of = 0; \
292 return cf | pf | af | zf | sf | of; \
293 } \
294 }
295
296 /*-------------------------------------------------------------*/
297
298 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \
299 { \
300 PREAMBLE(DATA_BITS); \
301 { Long cf, pf, af, zf, sf, of; \
302 Long argL, argR, res; \
303 res = CC_DEP1; \
304 argL = res - 1; \
305 argR = 1; \
306 cf = CC_NDEP & AMD64G_CC_MASK_C; \
307 pf = parity_table[(UChar)res]; \
308 af = (res ^ argL ^ argR) & 0x10; \
309 zf = ((DATA_UTYPE)res == 0) << 6; \
310 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
311 of = ((res & DATA_MASK) == SIGN_MASK) << 11; \
312 return cf | pf | af | zf | sf | of; \
313 } \
314 }
315
316 /*-------------------------------------------------------------*/
317
318 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \
319 { \
320 PREAMBLE(DATA_BITS); \
321 { Long cf, pf, af, zf, sf, of; \
322 Long argL, argR, res; \
323 res = CC_DEP1; \
324 argL = res + 1; \
325 argR = 1; \
326 cf = CC_NDEP & AMD64G_CC_MASK_C; \
327 pf = parity_table[(UChar)res]; \
328 af = (res ^ argL ^ argR) & 0x10; \
329 zf = ((DATA_UTYPE)res == 0) << 6; \
330 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
331 of = ((res & DATA_MASK) \
332 == ((ULong)SIGN_MASK - 1)) << 11; \
333 return cf | pf | af | zf | sf | of; \
334 } \
335 }
336
337 /*-------------------------------------------------------------*/
338
339 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \
340 { \
341 PREAMBLE(DATA_BITS); \
342 { Long cf, pf, af, zf, sf, of; \
343 cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C; \
344 pf = parity_table[(UChar)CC_DEP1]; \
345 af = 0; /* undefined */ \
346 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
347 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
348 /* of is defined if shift count == 1 */ \
349 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
350 & AMD64G_CC_MASK_O; \
351 return cf | pf | af | zf | sf | of; \
352 } \
353 }
354
355 /*-------------------------------------------------------------*/
356
357 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \
358 { \
359 PREAMBLE(DATA_BITS); \
360 { Long cf, pf, af, zf, sf, of; \
361 cf = CC_DEP2 & 1; \
362 pf = parity_table[(UChar)CC_DEP1]; \
363 af = 0; /* undefined */ \
364 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
365 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
366 /* of is defined if shift count == 1 */ \
367 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
368 & AMD64G_CC_MASK_O; \
369 return cf | pf | af | zf | sf | of; \
370 } \
371 }
372
373 /*-------------------------------------------------------------*/
374
375 /* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). */
376 /* DEP1 = result, NDEP = old flags */
377 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \
378 { \
379 PREAMBLE(DATA_BITS); \
380 { Long fl \
381 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
382 | (AMD64G_CC_MASK_C & CC_DEP1) \
383 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
384 11-(DATA_BITS-1)) \
385 ^ lshift(CC_DEP1, 11))); \
386 return fl; \
387 } \
388 }
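/* For example, an 8-bit ROL of 0x80 by 1 leaves DEP1 = 0x01; the C
   bit becomes lsb(result) = 1 and the O bit becomes
   msb(result) ^ lsb(result) = 0 ^ 1 = 1, matching the hardware
   definition of OF for a rotate by one. */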
389
390 /*-------------------------------------------------------------*/
391
392 /* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */
393 /* DEP1 = result, NDEP = old flags */
394 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \
395 { \
396 PREAMBLE(DATA_BITS); \
397 { Long fl \
398 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
399 | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \
400 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
401 11-(DATA_BITS-1)) \
402 ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \
403 return fl; \
404 } \
405 }
406
407 /*-------------------------------------------------------------*/
408
409 #define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE, NARROWtoU, \
410 DATA_U2TYPE, NARROWto2U) \
411 { \
412 PREAMBLE(DATA_BITS); \
413 { Long cf, pf, af, zf, sf, of; \
414 DATA_UTYPE hi; \
415 DATA_UTYPE lo \
416 = NARROWtoU( ((DATA_UTYPE)CC_DEP1) \
417 * ((DATA_UTYPE)CC_DEP2) ); \
418 DATA_U2TYPE rr \
419 = NARROWto2U( \
420 ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \
421 * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) ); \
422 hi = NARROWtoU(rr >>/*u*/ DATA_BITS); \
423 cf = (hi != 0); \
424 pf = parity_table[(UChar)lo]; \
425 af = 0; /* undefined */ \
426 zf = (lo == 0) << 6; \
427 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
428 of = cf << 11; \
429 return cf | pf | af | zf | sf | of; \
430 } \
431 }
432
433 /*-------------------------------------------------------------*/
434
435 #define ACTIONS_SMUL(DATA_BITS, DATA_STYPE, NARROWtoS, \
436 DATA_S2TYPE, NARROWto2S) \
437 { \
438 PREAMBLE(DATA_BITS); \
439 { Long cf, pf, af, zf, sf, of; \
440 DATA_STYPE hi; \
441 DATA_STYPE lo \
442 = NARROWtoS( ((DATA_STYPE)CC_DEP1) \
443 * ((DATA_STYPE)CC_DEP2) ); \
444 DATA_S2TYPE rr \
445 = NARROWto2S( \
446 ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \
447 * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) ); \
448 hi = NARROWtoS(rr >>/*s*/ DATA_BITS); \
449 cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \
450 pf = parity_table[(UChar)lo]; \
451 af = 0; /* undefined */ \
452 zf = (lo == 0) << 6; \
453 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
454 of = cf << 11; \
455 return cf | pf | af | zf | sf | of; \
456 } \
457 }
458
459 /*-------------------------------------------------------------*/
460
461 #define ACTIONS_UMULQ \
462 { \
463 PREAMBLE(64); \
464 { Long cf, pf, af, zf, sf, of; \
465 ULong lo, hi; \
466 mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo ); \
467 cf = (hi != 0); \
468 pf = parity_table[(UChar)lo]; \
469 af = 0; /* undefined */ \
470 zf = (lo == 0) << 6; \
471 sf = lshift(lo, 8 - 64) & 0x80; \
472 of = cf << 11; \
473 return cf | pf | af | zf | sf | of; \
474 } \
475 }
476
477 /*-------------------------------------------------------------*/
478
479 #define ACTIONS_SMULQ \
480 { \
481 PREAMBLE(64); \
482 { Long cf, pf, af, zf, sf, of; \
483 Long lo, hi; \
484 mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo ); \
485 cf = (hi != (lo >>/*s*/ (64-1))); \
486 pf = parity_table[(UChar)lo]; \
487 af = 0; /* undefined */ \
488 zf = (lo == 0) << 6; \
489 sf = lshift(lo, 8 - 64) & 0x80; \
490 of = cf << 11; \
491 return cf | pf | af | zf | sf | of; \
492 } \
493 }
494
495 /*-------------------------------------------------------------*/
496
497 #define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE) \
498 { \
499 PREAMBLE(DATA_BITS); \
500 { Long cf, pf, af, zf, sf, of; \
501 cf = 0; \
502 pf = 0; \
503 af = 0; \
504 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
505 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
506 of = 0; \
507 return cf | pf | af | zf | sf | of; \
508 } \
509 }
510
511 /*-------------------------------------------------------------*/
512
513 #define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE) \
514 { \
515 PREAMBLE(DATA_BITS); \
516 { Long cf, pf, af, zf, sf, of; \
517 cf = ((DATA_UTYPE)CC_DEP2 != 0); \
518 pf = 0; \
519 af = 0; \
520 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
521 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
522 of = 0; \
523 return cf | pf | af | zf | sf | of; \
524 } \
525 }
526
527 /*-------------------------------------------------------------*/
528
529 #define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE) \
530 { \
531 PREAMBLE(DATA_BITS); \
532 { Long cf, pf, af, zf, sf, of; \
533 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
534 pf = 0; \
535 af = 0; \
536 zf = 0; \
537 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
538 of = 0; \
539 return cf | pf | af | zf | sf | of; \
540 } \
541 }
542
543 /*-------------------------------------------------------------*/
544
545 #define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE) \
546 { \
547 PREAMBLE(DATA_BITS); \
548 { Long cf, pf, af, zf, sf, of; \
549 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
550 pf = 0; \
551 af = 0; \
552 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
553 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
554 of = 0; \
555 return cf | pf | af | zf | sf | of; \
556 } \
557 }
558
559 /*-------------------------------------------------------------*/
560
561
562 #if PROFILE_RFLAGS
563
564 static Bool initted = False;
565
566 /* C flag, fast route */
567 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
568 /* C flag, slow route */
569 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
570 /* table for calculate_cond */
571 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
572 /* total entry counts for calc_all, calc_c, calc_cond. */
573 static UInt n_calc_all = 0;
574 static UInt n_calc_c = 0;
575 static UInt n_calc_cond = 0;
576
577 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
578
579
static void showCounts ( void )
581 {
582 Int op, co;
583 HChar ch;
584 vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n",
585 n_calc_all, n_calc_cond, n_calc_c);
586
587 vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE"
588 " S NS P NP L NL LE NLE\n");
589 vex_printf(" -----------------------------------------------------"
590 "----------------------------------------\n");
591 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
592
593 ch = ' ';
594 if (op > 0 && (op-1) % 4 == 0)
595 ch = 'B';
596 if (op > 0 && (op-1) % 4 == 1)
597 ch = 'W';
598 if (op > 0 && (op-1) % 4 == 2)
599 ch = 'L';
600 if (op > 0 && (op-1) % 4 == 3)
601 ch = 'Q';
602
603 vex_printf("%2d%c: ", op, ch);
604 vex_printf("%6u ", tabc_slow[op]);
605 vex_printf("%6u ", tabc_fast[op]);
606 for (co = 0; co < 16; co++) {
607 Int n = tab_cond[op][co];
608 if (n >= 1000) {
609 vex_printf(" %3dK", n / 1000);
610 } else
611 if (n >= 0) {
612 vex_printf(" %3d ", n );
613 } else {
614 vex_printf(" ");
615 }
616 }
617 vex_printf("\n");
618 }
619 vex_printf("\n");
620 }
621
static void initCounts ( void )
623 {
624 Int op, co;
625 initted = True;
626 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
627 tabc_fast[op] = tabc_slow[op] = 0;
628 for (co = 0; co < 16; co++)
629 tab_cond[op][co] = 0;
630 }
631 }
632
633 #endif /* PROFILE_RFLAGS */
634
635
/* Worker function for the clean helpers below: calculates all 6
   flags from the supplied thunk parameters.  Not called directly
   from generated code. */
639 static
ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
641 ULong cc_dep1_formal,
642 ULong cc_dep2_formal,
643 ULong cc_ndep_formal )
644 {
645 switch (cc_op) {
646 case AMD64G_CC_OP_COPY:
647 return cc_dep1_formal
648 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
649 | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
650
651 case AMD64G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar );
652 case AMD64G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort );
653 case AMD64G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt );
654 case AMD64G_CC_OP_ADDQ: ACTIONS_ADD( 64, ULong );
655
656 case AMD64G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar );
657 case AMD64G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort );
658 case AMD64G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt );
659 case AMD64G_CC_OP_ADCQ: ACTIONS_ADC( 64, ULong );
660
661 case AMD64G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar );
662 case AMD64G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort );
663 case AMD64G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt );
664 case AMD64G_CC_OP_SUBQ: ACTIONS_SUB( 64, ULong );
665
666 case AMD64G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar );
667 case AMD64G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort );
668 case AMD64G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt );
669 case AMD64G_CC_OP_SBBQ: ACTIONS_SBB( 64, ULong );
670
671 case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar );
672 case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
673 case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt );
674 case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong );
675
676 case AMD64G_CC_OP_INCB: ACTIONS_INC( 8, UChar );
677 case AMD64G_CC_OP_INCW: ACTIONS_INC( 16, UShort );
678 case AMD64G_CC_OP_INCL: ACTIONS_INC( 32, UInt );
679 case AMD64G_CC_OP_INCQ: ACTIONS_INC( 64, ULong );
680
681 case AMD64G_CC_OP_DECB: ACTIONS_DEC( 8, UChar );
682 case AMD64G_CC_OP_DECW: ACTIONS_DEC( 16, UShort );
683 case AMD64G_CC_OP_DECL: ACTIONS_DEC( 32, UInt );
684 case AMD64G_CC_OP_DECQ: ACTIONS_DEC( 64, ULong );
685
686 case AMD64G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar );
687 case AMD64G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort );
688 case AMD64G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt );
689 case AMD64G_CC_OP_SHLQ: ACTIONS_SHL( 64, ULong );
690
691 case AMD64G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar );
692 case AMD64G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort );
693 case AMD64G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt );
694 case AMD64G_CC_OP_SHRQ: ACTIONS_SHR( 64, ULong );
695
696 case AMD64G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar );
697 case AMD64G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort );
698 case AMD64G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt );
699 case AMD64G_CC_OP_ROLQ: ACTIONS_ROL( 64, ULong );
700
701 case AMD64G_CC_OP_RORB: ACTIONS_ROR( 8, UChar );
702 case AMD64G_CC_OP_RORW: ACTIONS_ROR( 16, UShort );
703 case AMD64G_CC_OP_RORL: ACTIONS_ROR( 32, UInt );
704 case AMD64G_CC_OP_RORQ: ACTIONS_ROR( 64, ULong );
705
706 case AMD64G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, toUChar,
707 UShort, toUShort );
708 case AMD64G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, toUShort,
709 UInt, toUInt );
710 case AMD64G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, toUInt,
711 ULong, idULong );
712
713 case AMD64G_CC_OP_UMULQ: ACTIONS_UMULQ;
714
715 case AMD64G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, toUChar,
716 Short, toUShort );
717 case AMD64G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, toUShort,
718 Int, toUInt );
719 case AMD64G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, toUInt,
720 Long, idULong );
721
722 case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ;
723
724 case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt );
725 case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong );
726
727 case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt );
728 case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong );
729
730 case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt );
731 case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong );
732
733 case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt );
734 case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong );
735
736 default:
737 /* shouldn't really make these calls from generated code */
738 vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
739 "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
740 cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
741 vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
742 }
743 }
744
745
746 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate all 6 flags from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_all ( ULong cc_op,
749 ULong cc_dep1,
750 ULong cc_dep2,
751 ULong cc_ndep )
752 {
753 # if PROFILE_RFLAGS
754 if (!initted) initCounts();
755 n_calc_all++;
756 if (SHOW_COUNTS_NOW) showCounts();
757 # endif
758 return
759 amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
760 }
761
762
763 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
764 /* Calculate just the carry flag from the supplied thunk parameters. */
ULong amd64g_calculate_rflags_c ( ULong cc_op,
766 ULong cc_dep1,
767 ULong cc_dep2,
768 ULong cc_ndep )
769 {
770 # if PROFILE_RFLAGS
771 if (!initted) initCounts();
772 n_calc_c++;
773 tabc_fast[cc_op]++;
774 if (SHOW_COUNTS_NOW) showCounts();
775 # endif
776
777 /* Fast-case some common ones. */
778 switch (cc_op) {
779 case AMD64G_CC_OP_COPY:
780 return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
781 case AMD64G_CC_OP_LOGICQ:
782 case AMD64G_CC_OP_LOGICL:
783 case AMD64G_CC_OP_LOGICW:
784 case AMD64G_CC_OP_LOGICB:
785 return 0;
786 // case AMD64G_CC_OP_SUBL:
787 // return ((UInt)cc_dep1) < ((UInt)cc_dep2)
788 // ? AMD64G_CC_MASK_C : 0;
789 // case AMD64G_CC_OP_SUBW:
790 // return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
791 // ? AMD64G_CC_MASK_C : 0;
792 // case AMD64G_CC_OP_SUBB:
793 // return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
794 // ? AMD64G_CC_MASK_C : 0;
795 // case AMD64G_CC_OP_INCL:
796 // case AMD64G_CC_OP_DECL:
797 // return cc_ndep & AMD64G_CC_MASK_C;
798 default:
799 break;
800 }
801
802 # if PROFILE_RFLAGS
803 tabc_fast[cc_op]--;
804 tabc_slow[cc_op]++;
805 # endif
806
807 return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
808 & AMD64G_CC_MASK_C;
809 }
810
811
812 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
813 /* returns 1 or 0 */
ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
815 ULong cc_op,
816 ULong cc_dep1,
817 ULong cc_dep2,
818 ULong cc_ndep )
819 {
820 ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
821 cc_dep2, cc_ndep);
822 ULong of,sf,zf,cf,pf;
823 ULong inv = cond & 1;
824
825 # if PROFILE_RFLAGS
826 if (!initted) initCounts();
827 tab_cond[cc_op][cond]++;
828 n_calc_cond++;
829 if (SHOW_COUNTS_NOW) showCounts();
830 # endif
831
832 switch (cond) {
833 case AMD64CondNO:
834 case AMD64CondO: /* OF == 1 */
835 of = rflags >> AMD64G_CC_SHIFT_O;
836 return 1 & (inv ^ of);
837
838 case AMD64CondNZ:
839 case AMD64CondZ: /* ZF == 1 */
840 zf = rflags >> AMD64G_CC_SHIFT_Z;
841 return 1 & (inv ^ zf);
842
843 case AMD64CondNB:
844 case AMD64CondB: /* CF == 1 */
845 cf = rflags >> AMD64G_CC_SHIFT_C;
846 return 1 & (inv ^ cf);
847 break;
848
849 case AMD64CondNBE:
850 case AMD64CondBE: /* (CF or ZF) == 1 */
851 cf = rflags >> AMD64G_CC_SHIFT_C;
852 zf = rflags >> AMD64G_CC_SHIFT_Z;
853 return 1 & (inv ^ (cf | zf));
854 break;
855
856 case AMD64CondNS:
857 case AMD64CondS: /* SF == 1 */
858 sf = rflags >> AMD64G_CC_SHIFT_S;
859 return 1 & (inv ^ sf);
860
861 case AMD64CondNP:
862 case AMD64CondP: /* PF == 1 */
863 pf = rflags >> AMD64G_CC_SHIFT_P;
864 return 1 & (inv ^ pf);
865
866 case AMD64CondNL:
867 case AMD64CondL: /* (SF xor OF) == 1 */
868 sf = rflags >> AMD64G_CC_SHIFT_S;
869 of = rflags >> AMD64G_CC_SHIFT_O;
870 return 1 & (inv ^ (sf ^ of));
871 break;
872
873 case AMD64CondNLE:
874 case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
875 sf = rflags >> AMD64G_CC_SHIFT_S;
876 of = rflags >> AMD64G_CC_SHIFT_O;
877 zf = rflags >> AMD64G_CC_SHIFT_Z;
878 return 1 & (inv ^ ((sf ^ of) | zf));
879 break;
880
881 default:
882 /* shouldn't really make these calls from generated code */
883 vex_printf("amd64g_calculate_condition"
884 "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
885 cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
886 vpanic("amd64g_calculate_condition");
887 }
888 }
889
890
891 /* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
893 {
894 ULong rflags = amd64g_calculate_rflags_all_WRK(
895 vex_state->guest_CC_OP,
896 vex_state->guest_CC_DEP1,
897 vex_state->guest_CC_DEP2,
898 vex_state->guest_CC_NDEP
899 );
900 Long dflag = vex_state->guest_DFLAG;
901 vassert(dflag == 1 || dflag == -1);
902 if (dflag == -1)
903 rflags |= (1<<10);
904 if (vex_state->guest_IDFLAG == 1)
905 rflags |= (1<<21);
906 if (vex_state->guest_ACFLAG == 1)
907 rflags |= (1<<18);
908
909 return rflags;
910 }
911
912 /* VISIBLE TO LIBVEX CLIENT */
913 void
LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
915 /*MOD*/VexGuestAMD64State* vex_state )
916 {
917 ULong oszacp = amd64g_calculate_rflags_all_WRK(
918 vex_state->guest_CC_OP,
919 vex_state->guest_CC_DEP1,
920 vex_state->guest_CC_DEP2,
921 vex_state->guest_CC_NDEP
922 );
923 if (new_carry_flag & 1) {
924 oszacp |= AMD64G_CC_MASK_C;
925 } else {
926 oszacp &= ~AMD64G_CC_MASK_C;
927 }
928 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
929 vex_state->guest_CC_DEP1 = oszacp;
930 vex_state->guest_CC_DEP2 = 0;
931 vex_state->guest_CC_NDEP = 0;
932 }
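/* Example of intended client usage (a sketch; 'st' stands for some
   already-initialised VexGuestAMD64State):

      LibVEX_GuestAMD64_put_rflag_c(1, &st);

   forces CF to 1 while preserving O/S/Z/A/P, leaving the thunk in
   COPY form, so that LibVEX_GuestAMD64_get_rflags(&st) then has
   AMD64G_CC_MASK_C set. */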
933
934
935 /*---------------------------------------------------------------*/
936 /*--- %rflags translation-time function specialisers. ---*/
937 /*--- These help iropt specialise calls the above run-time ---*/
938 /*--- %rflags functions. ---*/
939 /*---------------------------------------------------------------*/
940
941 /* Used by the optimiser to try specialisations. Returns an
942 equivalent expression, or NULL if none. */
943
static Bool isU64 ( IRExpr* e, ULong n )
945 {
946 return toBool( e->tag == Iex_Const
947 && e->Iex.Const.con->tag == Ico_U64
948 && e->Iex.Const.con->Ico.U64 == n );
949 }
950
IRExpr* guest_amd64_spechelper ( const HChar* function_name,
952 IRExpr** args,
953 IRStmt** precedingStmts,
954 Int n_precedingStmts )
955 {
956 # define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
957 # define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
958 # define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
959 # define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
960 # define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
961
962 Int i, arity = 0;
963 for (i = 0; args[i]; i++)
964 arity++;
965 # if 0
966 vex_printf("spec request:\n");
967 vex_printf(" %s ", function_name);
968 for (i = 0; i < arity; i++) {
969 vex_printf(" ");
970 ppIRExpr(args[i]);
971 }
972 vex_printf("\n");
973 # endif
974
975 /* --------- specialising "amd64g_calculate_condition" --------- */
976
977 if (vex_streq(function_name, "amd64g_calculate_condition")) {
978 /* specialise calls to above "calculate condition" function */
979 IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
980 vassert(arity == 5);
981 cond = args[0];
982 cc_op = args[1];
983 cc_dep1 = args[2];
984 cc_dep2 = args[3];
985
986 /*---------------- ADDQ ----------------*/
987
988 if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
989 /* long long add, then Z --> test (dst+src == 0) */
990 return unop(Iop_1Uto64,
991 binop(Iop_CmpEQ64,
992 binop(Iop_Add64, cc_dep1, cc_dep2),
993 mkU64(0)));
994 }
995
996 /*---------------- SUBQ ----------------*/
997
998 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
999 /* long long sub/cmp, then Z --> test dst==src */
1000 return unop(Iop_1Uto64,
1001 binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
1002 }
1003 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
1004 /* long long sub/cmp, then NZ --> test dst!=src */
1005 return unop(Iop_1Uto64,
1006 binop(Iop_CmpNE64,cc_dep1,cc_dep2));
1007 }
1008
1009 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
1010 /* long long sub/cmp, then L (signed less than)
1011 --> test dst <s src */
1012 return unop(Iop_1Uto64,
1013 binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1014 }
1015
1016 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
1017 /* long long sub/cmp, then B (unsigned less than)
1018 --> test dst <u src */
1019 return unop(Iop_1Uto64,
1020 binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1021 }
1022 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
1023 /* long long sub/cmp, then NB (unsigned greater than or equal)
1024 --> test src <=u dst */
1025 /* Note, args are opposite way round from the usual */
1026 return unop(Iop_1Uto64,
1027 binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1028 }
1029
1030 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
1031 /* long sub/cmp, then NLE (signed greater than)
1032 --> test !(dst <=s src)
1033 --> test (dst >s src)
1034 --> test (src <s dst) */
1035 return unop(Iop_1Uto64,
1036 binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1037
1038 }
1039
1040 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
1041 /* long long sub/cmp, then BE (unsigned less than or equal)
1042 --> test dst <=u src */
1043 return unop(Iop_1Uto64,
1044 binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1045 }
1046 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
1047 /* long long sub/cmp, then NBE (unsigned greater than)
1048 --> test !(dst <=u src) */
1049 return binop(Iop_Xor64,
1050 unop(Iop_1Uto64,
1051 binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
1052 mkU64(1));
1053 }
1054
1055 /*---------------- SUBL ----------------*/
1056
1057 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
1058 /* long sub/cmp, then Z --> test dst==src */
1059 return unop(Iop_1Uto64,
1060 binop(Iop_CmpEQ32,
1061 unop(Iop_64to32, cc_dep1),
1062 unop(Iop_64to32, cc_dep2)));
1063 }
1064 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
1065 /* long sub/cmp, then NZ --> test dst!=src */
1066 return unop(Iop_1Uto64,
1067 binop(Iop_CmpNE32,
1068 unop(Iop_64to32, cc_dep1),
1069 unop(Iop_64to32, cc_dep2)));
1070 }
1071
1072 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
1073 /* long sub/cmp, then L (signed less than)
1074 --> test dst <s src */
1075 return unop(Iop_1Uto64,
1076 binop(Iop_CmpLT32S,
1077 unop(Iop_64to32, cc_dep1),
1078 unop(Iop_64to32, cc_dep2)));
1079 }
1080
1081 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
1082 /* long sub/cmp, then LE (signed less than or equal)
1083 --> test dst <=s src */
1084 return unop(Iop_1Uto64,
1085 binop(Iop_CmpLE32S,
1086 unop(Iop_64to32, cc_dep1),
1087 unop(Iop_64to32, cc_dep2)));
1088
1089 }
1090 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
1091 /* long sub/cmp, then NLE (signed greater than)
1092 --> test !(dst <=s src)
1093 --> test (dst >s src)
1094 --> test (src <s dst) */
1095 return unop(Iop_1Uto64,
1096 binop(Iop_CmpLT32S,
1097 unop(Iop_64to32, cc_dep2),
1098 unop(Iop_64to32, cc_dep1)));
1099
1100 }
1101
1102 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
1103 /* long sub/cmp, then BE (unsigned less than or equal)
1104 --> test dst <=u src */
1105 return unop(Iop_1Uto64,
1106 binop(Iop_CmpLE32U,
1107 unop(Iop_64to32, cc_dep1),
1108 unop(Iop_64to32, cc_dep2)));
1109 }
1110 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
1111 /* long sub/cmp, then NBE (unsigned greater than)
1112 --> test src <u dst */
1113 /* Note, args are opposite way round from the usual */
1114 return unop(Iop_1Uto64,
1115 binop(Iop_CmpLT32U,
1116 unop(Iop_64to32, cc_dep2),
1117 unop(Iop_64to32, cc_dep1)));
1118 }
1119
1120 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
1121 /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
1122 return unop(Iop_1Uto64,
1123 binop(Iop_CmpLT32S,
1124 binop(Iop_Sub32,
1125 unop(Iop_64to32, cc_dep1),
1126 unop(Iop_64to32, cc_dep2)),
1127 mkU32(0)));
1128 }
1129
1130 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
1131 /* long sub/cmp, then B (unsigned less than)
1132 --> test dst <u src */
1133 return unop(Iop_1Uto64,
1134 binop(Iop_CmpLT32U,
1135 unop(Iop_64to32, cc_dep1),
1136 unop(Iop_64to32, cc_dep2)));
1137 }
1138
1139 /*---------------- SUBW ----------------*/
1140
1141 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
1142 /* word sub/cmp, then Z --> test dst==src */
1143 return unop(Iop_1Uto64,
1144 binop(Iop_CmpEQ16,
1145 unop(Iop_64to16,cc_dep1),
1146 unop(Iop_64to16,cc_dep2)));
1147 }
1148 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
1149 /* word sub/cmp, then NZ --> test dst!=src */
1150 return unop(Iop_1Uto64,
1151 binop(Iop_CmpNE16,
1152 unop(Iop_64to16,cc_dep1),
1153 unop(Iop_64to16,cc_dep2)));
1154 }
1155
1156 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
1157 /* word sub/cmp, then LE (signed less than or equal)
1158 --> test dst <=s src */
1159 return unop(Iop_1Uto64,
1160 binop(Iop_CmpLE64S,
1161 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1162 binop(Iop_Shl64,cc_dep2,mkU8(48))));
1163
1164 }
1165
1166 /*---------------- SUBB ----------------*/
1167
1168 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
1169 /* byte sub/cmp, then Z --> test dst==src */
1170 return unop(Iop_1Uto64,
1171 binop(Iop_CmpEQ8,
1172 unop(Iop_64to8,cc_dep1),
1173 unop(Iop_64to8,cc_dep2)));
1174 }
1175 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
1176 /* byte sub/cmp, then NZ --> test dst!=src */
1177 return unop(Iop_1Uto64,
1178 binop(Iop_CmpNE8,
1179 unop(Iop_64to8,cc_dep1),
1180 unop(Iop_64to8,cc_dep2)));
1181 }
1182
1183 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
1184 /* byte sub/cmp, then BE (unsigned less than or equal)
1185 --> test dst <=u src */
1186 return unop(Iop_1Uto64,
1187 binop(Iop_CmpLE64U,
1188 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1189 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1190 }
1191
1192 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
1193 && isU64(cc_dep2, 0)) {
1194 /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
1195 --> test dst <s 0
1196 --> (ULong)dst[7]
1197 This is yet another scheme by which gcc figures out if the
1198 top bit of a byte is 1 or 0. See also LOGICB/CondS below. */
1199 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1200 for an 8-bit comparison, since the args to the helper
1201 function are always U64s. */
1202 return binop(Iop_And64,
1203 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1204 mkU64(1));
1205 }
1206 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
1207 && isU64(cc_dep2, 0)) {
1208 /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1209 --> test !(dst <s 0)
1210 --> (ULong) !dst[7]
1211 */
1212 return binop(Iop_Xor64,
1213 binop(Iop_And64,
1214 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1215 mkU64(1)),
1216 mkU64(1));
1217 }
1218
1219 /*---------------- LOGICQ ----------------*/
1220
1221 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
1222 /* long long and/or/xor, then Z --> test dst==0 */
1223 return unop(Iop_1Uto64,
1224 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1225 }
1226 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
1227 /* long long and/or/xor, then NZ --> test dst!=0 */
1228 return unop(Iop_1Uto64,
1229 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1230 }
1231
1232 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
1233 /* long long and/or/xor, then L
1234 LOGIC sets SF and ZF according to the
1235 result and makes OF be zero. L computes SF ^ OF, but
1236 OF is zero, so this reduces to SF -- which will be 1 iff
1237 the result is < signed 0. Hence ...
1238 */
1239 return unop(Iop_1Uto64,
1240 binop(Iop_CmpLT64S,
1241 cc_dep1,
1242 mkU64(0)));
1243 }
1244
1245 /*---------------- LOGICL ----------------*/
1246
1247 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
1248 /* long and/or/xor, then Z --> test dst==0 */
1249 return unop(Iop_1Uto64,
1250 binop(Iop_CmpEQ32,
1251 unop(Iop_64to32, cc_dep1),
1252 mkU32(0)));
1253 }
1254 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
1255 /* long and/or/xor, then NZ --> test dst!=0 */
1256 return unop(Iop_1Uto64,
1257 binop(Iop_CmpNE32,
1258 unop(Iop_64to32, cc_dep1),
1259 mkU32(0)));
1260 }
1261
1262 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
1263 /* long and/or/xor, then LE
1264 This is pretty subtle. LOGIC sets SF and ZF according to the
1265 result and makes OF be zero. LE computes (SF ^ OF) | ZF, but
1266 OF is zero, so this reduces to SF | ZF -- which will be 1 iff
1267 the result is <=signed 0. Hence ...
1268 */
1269 return unop(Iop_1Uto64,
1270 binop(Iop_CmpLE32S,
1271 unop(Iop_64to32, cc_dep1),
1272 mkU32(0)));
1273 }
1274
1275 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
1276 /* long and/or/xor, then S --> (ULong)result[31] */
1277 return binop(Iop_And64,
1278 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1279 mkU64(1));
1280 }
1281 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
      /* long and/or/xor, then NS --> (ULong) !result[31] */
1283 return binop(Iop_Xor64,
1284 binop(Iop_And64,
1285 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1286 mkU64(1)),
1287 mkU64(1));
1288 }
1289
1290 /*---------------- LOGICW ----------------*/
1291
1292 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1293 /* word and/or/xor, then Z --> test dst==0 */
1294 return unop(Iop_1Uto64,
1295 binop(Iop_CmpEQ64,
1296 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1297 mkU64(0)));
1298 }
1299 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1300 /* word and/or/xor, then NZ --> test dst!=0 */
1301 return unop(Iop_1Uto64,
1302 binop(Iop_CmpNE64,
1303 binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
1304 mkU64(0)));
1305 }
1306
1307 /*---------------- LOGICB ----------------*/
1308
1309 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1310 /* byte and/or/xor, then Z --> test dst==0 */
1311 return unop(Iop_1Uto64,
1312 binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
1313 mkU64(0)));
1314 }
1315 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1316 /* byte and/or/xor, then NZ --> test dst!=0 */
1317 return unop(Iop_1Uto64,
1318 binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
1319 mkU64(0)));
1320 }
1321
1322 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1323 /* this is an idiom gcc sometimes uses to find out if the top
1324 bit of a byte register is set: eg testb %al,%al; js ..
1325 Since it just depends on the top bit of the byte, extract
1326 that bit and explicitly get rid of all the rest. This
1327 helps memcheck avoid false positives in the case where any
1328 of the other bits in the byte are undefined. */
1329 /* byte and/or/xor, then S --> (UInt)result[7] */
1330 return binop(Iop_And64,
1331 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1332 mkU64(1));
1333 }
1334 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1335 /* byte and/or/xor, then NS --> (UInt)!result[7] */
1336 return binop(Iop_Xor64,
1337 binop(Iop_And64,
1338 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1339 mkU64(1)),
1340 mkU64(1));
1341 }
1342
1343 /*---------------- INCB ----------------*/
1344
1345 if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1346 /* 8-bit inc, then LE --> sign bit of the arg */
1347 return binop(Iop_And64,
1348 binop(Iop_Shr64,
1349 binop(Iop_Sub64, cc_dep1, mkU64(1)),
1350 mkU8(7)),
1351 mkU64(1));
1352 }
1353
1354 /*---------------- INCW ----------------*/
1355
1356 if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1357 /* 16-bit inc, then Z --> test dst == 0 */
1358 return unop(Iop_1Uto64,
1359 binop(Iop_CmpEQ64,
1360 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1361 mkU64(0)));
1362 }
1363
1364 /*---------------- DECL ----------------*/
1365
1366 if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1367 /* dec L, then Z --> test dst == 0 */
1368 return unop(Iop_1Uto64,
1369 binop(Iop_CmpEQ32,
1370 unop(Iop_64to32, cc_dep1),
1371 mkU32(0)));
1372 }
1373
1374 /*---------------- DECW ----------------*/
1375
1376 if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1377 /* 16-bit dec, then NZ --> test dst != 0 */
1378 return unop(Iop_1Uto64,
1379 binop(Iop_CmpNE64,
1380 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1381 mkU64(0)));
1382 }
1383
1384 /*---------------- COPY ----------------*/
1385 /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1386 jbe" for example. */
1387
1388 if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
1389 (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1390 /* COPY, then BE --> extract C and Z from dep1, and test (C
1391 or Z == 1). */
1392 /* COPY, then NBE --> extract C and Z from dep1, and test (C
1393 or Z == 0). */
1394 ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1395 return
1396 unop(
1397 Iop_1Uto64,
1398 binop(
1399 Iop_CmpEQ64,
1400 binop(
1401 Iop_And64,
1402 binop(
1403 Iop_Or64,
1404 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1405 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
1406 ),
1407 mkU64(1)
1408 ),
1409 mkU64(nnn)
1410 )
1411 );
1412 }
1413
1414 if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
      /* COPY, then B --> extract C from dep1, and test (C == 1). */
1416 return
1417 unop(
1418 Iop_1Uto64,
1419 binop(
1420 Iop_CmpNE64,
1421 binop(
1422 Iop_And64,
1423 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
1424 mkU64(1)
1425 ),
1426 mkU64(0)
1427 )
1428 );
1429 }
1430
1431 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1432 && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
1433 /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
1434 /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
1435 UInt nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
1436 return
1437 unop(
1438 Iop_1Uto64,
1439 binop(
1440 Iop_CmpEQ64,
1441 binop(
1442 Iop_And64,
1443 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
1444 mkU64(1)
1445 ),
1446 mkU64(nnn)
1447 )
1448 );
1449 }
1450
1451 if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
1452 /* COPY, then P --> extract P from dep1, and test (P == 1). */
1453 return
1454 unop(
1455 Iop_1Uto64,
1456 binop(
1457 Iop_CmpNE64,
1458 binop(
1459 Iop_And64,
1460 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
1461 mkU64(1)
1462 ),
1463 mkU64(0)
1464 )
1465 );
1466 }
1467
1468 return NULL;
1469 }
1470
1471 /* --------- specialising "amd64g_calculate_rflags_c" --------- */
1472
1473 if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
1474 /* specialise calls to above "calculate_rflags_c" function */
1475 IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1476 vassert(arity == 4);
1477 cc_op = args[0];
1478 cc_dep1 = args[1];
1479 cc_dep2 = args[2];
1480 cc_ndep = args[3];
1481
1482 if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
1483 /* C after sub denotes unsigned less than */
1484 return unop(Iop_1Uto64,
1485 binop(Iop_CmpLT64U,
1486 cc_dep1,
1487 cc_dep2));
1488 }
1489 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1490 /* C after sub denotes unsigned less than */
1491 return unop(Iop_1Uto64,
1492 binop(Iop_CmpLT32U,
1493 unop(Iop_64to32, cc_dep1),
1494 unop(Iop_64to32, cc_dep2)));
1495 }
1496 if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
1497 /* C after sub denotes unsigned less than */
1498 return unop(Iop_1Uto64,
1499 binop(Iop_CmpLT64U,
1500 binop(Iop_And64,cc_dep1,mkU64(0xFF)),
1501 binop(Iop_And64,cc_dep2,mkU64(0xFF))));
1502 }
1503 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
1504 || isU64(cc_op, AMD64G_CC_OP_LOGICL)
1505 || isU64(cc_op, AMD64G_CC_OP_LOGICW)
1506 || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
1507 /* cflag after logic is zero */
1508 return mkU64(0);
1509 }
1510 if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
1511 || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
1512 /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
1513 return cc_ndep;
1514 }
1515
1516 # if 0
1517 if (cc_op->tag == Iex_Const) {
1518 vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
1519 }
1520 # endif
1521
1522 return NULL;
1523 }
1524
1525 # undef unop
1526 # undef binop
1527 # undef mkU64
1528 # undef mkU32
1529 # undef mkU8
1530
1531 return NULL;
1532 }
1533
1534
1535 /*---------------------------------------------------------------*/
1536 /*--- Supporting functions for x87 FPU activities. ---*/
1537 /*---------------------------------------------------------------*/
1538
static inline Bool host_is_little_endian ( void )
1540 {
1541 UInt x = 0x76543210;
1542 UChar* p = (UChar*)(&x);
1543 return toBool(*p == 0x10);
1544 }
1545
1546 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
1547 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
1549 {
1550 Bool mantissaIsZero;
1551 Int bexp;
1552 UChar sign;
1553 UChar* f64;
1554
1555 vassert(host_is_little_endian());
1556
1557 /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
1558
1559 f64 = (UChar*)(&dbl);
1560 sign = toUChar( (f64[7] >> 7) & 1 );
1561
1562 /* First off, if the tag indicates the register was empty,
1563 return 1,0,sign,1 */
1564 if (tag == 0) {
1565 /* vex_printf("Empty\n"); */
1566 return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
1567 | AMD64G_FC_MASK_C0;
1568 }
1569
1570 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
1571 bexp &= 0x7FF;
1572
1573 mantissaIsZero
1574 = toBool(
1575 (f64[6] & 0x0F) == 0
1576 && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
1577 );
1578
1579 /* If both exponent and mantissa are zero, the value is zero.
1580 Return 1,0,sign,0. */
1581 if (bexp == 0 && mantissaIsZero) {
1582 /* vex_printf("Zero\n"); */
1583 return AMD64G_FC_MASK_C3 | 0
1584 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1585 }
1586
1587 /* If exponent is zero but mantissa isn't, it's a denormal.
1588 Return 1,1,sign,0. */
1589 if (bexp == 0 && !mantissaIsZero) {
1590 /* vex_printf("Denormal\n"); */
1591 return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
1592 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1593 }
1594
1595 /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
1596 Return 0,1,sign,1. */
1597 if (bexp == 0x7FF && mantissaIsZero) {
1598 /* vex_printf("Inf\n"); */
1599 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
1600 | AMD64G_FC_MASK_C0;
1601 }
1602
1603 /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
1604 Return 0,0,sign,1. */
1605 if (bexp == 0x7FF && !mantissaIsZero) {
1606 /* vex_printf("NaN\n"); */
1607 return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
1608 }
1609
1610 /* Uh, ok, we give up. It must be a normal finite number.
1611 Return 0,1,sign,0.
1612 */
1613 /* vex_printf("normal\n"); */
1614 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1615 }
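/* For example, a non-empty (tag != 0) register holding 1.0 has
   bexp == 0x3FF and a zero mantissa field, so none of the special
   cases above apply and the result is C3=0, C2=1, C1=sign=0, C0=0,
   i.e. "normal finite", as FXAM reports on real hardware. */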
1616
1617
1618 /* This is used to implement both 'frstor' and 'fldenv'. The latter
1619 appears to differ from the former only in that the 8 FP registers
1620 themselves are not transferred into the guest state. */
1621 static
VexEmNote do_put_x87 ( Bool moveRegs,
1623 /*IN*/UChar* x87_state,
1624 /*OUT*/VexGuestAMD64State* vex_state )
1625 {
1626 Int stno, preg;
1627 UInt tag;
1628 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1629 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1630 Fpu_State* x87 = (Fpu_State*)x87_state;
1631 UInt ftop = (x87->env[FP_ENV_STAT] >> 11) & 7;
1632 UInt tagw = x87->env[FP_ENV_TAG];
1633 UInt fpucw = x87->env[FP_ENV_CTRL];
1634 UInt c3210 = x87->env[FP_ENV_STAT] & 0x4700;
1635 VexEmNote ew;
1636 UInt fpround;
1637 ULong pair;
1638
1639 /* Copy registers and tags */
1640 for (stno = 0; stno < 8; stno++) {
1641 preg = (stno + ftop) & 7;
1642 tag = (tagw >> (2*preg)) & 3;
1643 if (tag == 3) {
1644 /* register is empty */
1645 /* hmm, if it's empty, does it still get written? Probably
1646 safer to say it does. If we don't, memcheck could get out
1647 of sync, in that it thinks all FP registers are defined by
1648 this helper, but in reality some have not been updated. */
1649 if (moveRegs)
1650 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
1651 vexTags[preg] = 0;
1652 } else {
1653 /* register is non-empty */
1654 if (moveRegs)
1655 convert_f80le_to_f64le( &x87->reg[10*stno],
1656 (UChar*)&vexRegs[preg] );
1657 vexTags[preg] = 1;
1658 }
1659 }
1660
1661 /* stack pointer */
1662 vex_state->guest_FTOP = ftop;
1663
1664 /* status word */
1665 vex_state->guest_FC3210 = c3210;
1666
1667 /* handle the control word, setting FPROUND and detecting any
1668 emulation warnings. */
1669 pair = amd64g_check_fldcw ( (ULong)fpucw );
1670 fpround = (UInt)pair & 0xFFFFFFFFULL;
1671 ew = (VexEmNote)(pair >> 32);
1672
1673 vex_state->guest_FPROUND = fpround & 3;
1674
1675 /* emulation warnings --> caller */
1676 return ew;
1677 }
1678
1679
1680 /* Create an x87 FPU state from the guest state, as close as
1681 we can approximate it. */
1682 static
void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
1684 /*OUT*/UChar* x87_state )
1685 {
1686 Int i, stno, preg;
1687 UInt tagw;
1688 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1689 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1690 Fpu_State* x87 = (Fpu_State*)x87_state;
1691 UInt ftop = vex_state->guest_FTOP;
1692 UInt c3210 = vex_state->guest_FC3210;
1693
1694 for (i = 0; i < 14; i++)
1695 x87->env[i] = 0;
1696
1697 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1698 x87->env[FP_ENV_STAT]
1699 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
1700 x87->env[FP_ENV_CTRL]
1701 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
1702
1703 /* Dump the register stack in ST order. */
1704 tagw = 0;
1705 for (stno = 0; stno < 8; stno++) {
1706 preg = (stno + ftop) & 7;
1707 if (vexTags[preg] == 0) {
1708 /* register is empty */
1709 tagw |= (3 << (2*preg));
1710 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1711 &x87->reg[10*stno] );
1712 } else {
1713 /* register is full. */
1714 tagw |= (0 << (2*preg));
1715 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1716 &x87->reg[10*stno] );
1717 }
1718 }
1719 x87->env[FP_ENV_TAG] = toUShort(tagw);
1720 }
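/* Note the two tag conventions involved above: the architected x87
   tag word uses two bits per register, with 3 meaning "empty" and 0
   meaning "valid", whereas the guest state keeps one byte per
   register, with 0 meaning "empty" and 1 meaning "full".  do_put_x87
   and do_get_x87 translate between the two; the finer "zero" and
   "special" tag encodings are not tracked on the guest side. */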
1721
1722
1723 /* CALLED FROM GENERATED CODE */
1724 /* DIRTY HELPER (reads guest state, writes guest mem) */
1725 /* NOTE: only handles 32-bit format (no REX.W on the insn) */
void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
1727 {
1728 /* Derived from values obtained from
1729 vendor_id : AuthenticAMD
1730 cpu family : 15
1731 model : 12
1732 model name : AMD Athlon(tm) 64 Processor 3200+
1733 stepping : 0
1734 cpu MHz : 2200.000
1735 cache size : 512 KB
1736 */
1737 /* Somewhat roundabout, but at least it's simple. */
1738 Fpu_State tmp;
1739 UShort* addrS = (UShort*)addr;
1740 UChar* addrC = (UChar*)addr;
1741 U128* xmm = (U128*)(addr + 160);
1742 UInt mxcsr;
1743 UShort fp_tags;
1744 UInt summary_tags;
1745 Int r, stno;
1746 UShort *srcS, *dstS;
1747
1748 do_get_x87( gst, (UChar*)&tmp );
1749 mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
1750
1751 /* Now build the proper fxsave image from the x87 image we just
1752 made. */
1753
1754 addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
   addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
1756
1757 /* set addrS[2] in an endian-independent way */
1758 summary_tags = 0;
1759 fp_tags = tmp.env[FP_ENV_TAG];
1760 for (r = 0; r < 8; r++) {
1761 if ( ((fp_tags >> (2*r)) & 3) != 3 )
1762 summary_tags |= (1 << r);
1763 }
1764 addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */
1765 addrC[5] = 0; /* pad */
1766
1767 /* FOP: faulting fpu opcode. From experimentation, the real CPU
1768 does not write this field. (?!) */
1769 addrS[3] = 0; /* BOGUS */
1770
1771 /* RIP (Last x87 instruction pointer). From experimentation, the
1772 real CPU does not write this field. (?!) */
1773 addrS[4] = 0; /* BOGUS */
1774 addrS[5] = 0; /* BOGUS */
1775 addrS[6] = 0; /* BOGUS */
1776 addrS[7] = 0; /* BOGUS */
1777
1778 /* RDP (Last x87 data pointer). From experimentation, the real CPU
1779 does not write this field. (?!) */
1780 addrS[8] = 0; /* BOGUS */
1781 addrS[9] = 0; /* BOGUS */
1782 addrS[10] = 0; /* BOGUS */
1783 addrS[11] = 0; /* BOGUS */
1784
1785 addrS[12] = toUShort(mxcsr); /* MXCSR */
1786 addrS[13] = toUShort(mxcsr >> 16);
1787
1788 addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
1789 addrS[15] = 0x0000; /* MXCSR mask (hi16) */
1790
1791 /* Copy in the FP registers, in ST order. */
1792 for (stno = 0; stno < 8; stno++) {
1793 srcS = (UShort*)(&tmp.reg[10*stno]);
1794 dstS = (UShort*)(&addrS[16 + 8*stno]);
1795 dstS[0] = srcS[0];
1796 dstS[1] = srcS[1];
1797 dstS[2] = srcS[2];
1798 dstS[3] = srcS[3];
1799 dstS[4] = srcS[4];
1800 dstS[5] = 0;
1801 dstS[6] = 0;
1802 dstS[7] = 0;
1803 }
1804
1805 /* That's the first 160 bytes of the image done. Now only %xmm0
1806 .. %xmm15 remain to be copied. If the host is big-endian, these
1807 need to be byte-swapped. */
1808 vassert(host_is_little_endian());
1809
1810 # define COPY_U128(_dst,_src) \
1811 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
1812 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
1813 while (0)
1814
1815 COPY_U128( xmm[0], gst->guest_YMM0 );
1816 COPY_U128( xmm[1], gst->guest_YMM1 );
1817 COPY_U128( xmm[2], gst->guest_YMM2 );
1818 COPY_U128( xmm[3], gst->guest_YMM3 );
1819 COPY_U128( xmm[4], gst->guest_YMM4 );
1820 COPY_U128( xmm[5], gst->guest_YMM5 );
1821 COPY_U128( xmm[6], gst->guest_YMM6 );
1822 COPY_U128( xmm[7], gst->guest_YMM7 );
1823 COPY_U128( xmm[8], gst->guest_YMM8 );
1824 COPY_U128( xmm[9], gst->guest_YMM9 );
1825 COPY_U128( xmm[10], gst->guest_YMM10 );
1826 COPY_U128( xmm[11], gst->guest_YMM11 );
1827 COPY_U128( xmm[12], gst->guest_YMM12 );
1828 COPY_U128( xmm[13], gst->guest_YMM13 );
1829 COPY_U128( xmm[14], gst->guest_YMM14 );
1830 COPY_U128( xmm[15], gst->guest_YMM15 );
1831
1832 # undef COPY_U128
1833 }
1834
1835
1836 /* CALLED FROM GENERATED CODE */
1837 /* DIRTY HELPER (writes guest state, reads guest mem) */
1838 VexEmNote amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
1839 {
1840 Fpu_State tmp;
1841 VexEmNote warnX87 = EmNote_NONE;
1842 VexEmNote warnXMM = EmNote_NONE;
1843 UShort* addrS = (UShort*)addr;
1844 UChar* addrC = (UChar*)addr;
1845 U128* xmm = (U128*)(addr + 160);
1846 UShort fp_tags;
1847 Int r, stno, i;
1848
1849 /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need
1850 to be byte-swapped. */
1851 vassert(host_is_little_endian());
1852
1853 # define COPY_U128(_dst,_src) \
1854 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
1855 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
1856 while (0)
1857
1858 COPY_U128( gst->guest_YMM0, xmm[0] );
1859 COPY_U128( gst->guest_YMM1, xmm[1] );
1860 COPY_U128( gst->guest_YMM2, xmm[2] );
1861 COPY_U128( gst->guest_YMM3, xmm[3] );
1862 COPY_U128( gst->guest_YMM4, xmm[4] );
1863 COPY_U128( gst->guest_YMM5, xmm[5] );
1864 COPY_U128( gst->guest_YMM6, xmm[6] );
1865 COPY_U128( gst->guest_YMM7, xmm[7] );
1866 COPY_U128( gst->guest_YMM8, xmm[8] );
1867 COPY_U128( gst->guest_YMM9, xmm[9] );
1868 COPY_U128( gst->guest_YMM10, xmm[10] );
1869 COPY_U128( gst->guest_YMM11, xmm[11] );
1870 COPY_U128( gst->guest_YMM12, xmm[12] );
1871 COPY_U128( gst->guest_YMM13, xmm[13] );
1872 COPY_U128( gst->guest_YMM14, xmm[14] );
1873 COPY_U128( gst->guest_YMM15, xmm[15] );
1874
1875 # undef COPY_U128
1876
1877 /* Copy the x87 registers out of the image, into a temporary
1878 Fpu_State struct. */
1879 for (i = 0; i < 14; i++) tmp.env[i] = 0;
1880 for (i = 0; i < 80; i++) tmp.reg[i] = 0;
1881 /* fill in tmp.reg[0..7] */
1882 for (stno = 0; stno < 8; stno++) {
1883 UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
1884 UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
1885 dstS[0] = srcS[0];
1886 dstS[1] = srcS[1];
1887 dstS[2] = srcS[2];
1888 dstS[3] = srcS[3];
1889 dstS[4] = srcS[4];
1890 }
1891 /* fill in tmp.env[0..13] */
1892 tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
1893    tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
1894
1895 fp_tags = 0;
1896 for (r = 0; r < 8; r++) {
1897 if (addrC[4] & (1<<r))
1898 fp_tags |= (0 << (2*r)); /* EMPTY */
1899 else
1900 fp_tags |= (3 << (2*r)); /* VALID -- not really precise enough. */
1901 }
1902 tmp.env[FP_ENV_TAG] = fp_tags;
1903
1904 /* Now write 'tmp' into the guest state. */
1905 warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
1906
1907 { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
1908 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
1909 ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
1910
1911 warnXMM = (VexEmNote)(w64 >> 32);
1912
1913 gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
1914 }
1915
1916 /* Prefer an X87 emwarn over an XMM one, if both exist. */
1917 if (warnX87 != EmNote_NONE)
1918 return warnX87;
1919 else
1920 return warnXMM;
1921 }
1922
1923
1924 /* DIRTY HELPER (writes guest state) */
1925 /* Initialise the x87 FPU state as per 'finit'. */
1926 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
1927 {
1928 Int i;
1929 gst->guest_FTOP = 0;
1930 for (i = 0; i < 8; i++) {
1931 gst->guest_FPTAG[i] = 0; /* empty */
1932 gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
1933 }
1934 gst->guest_FPROUND = (ULong)Irrm_NEAREST;
1935 gst->guest_FC3210 = 0;
1936 }
1937
1938
1939 /* CALLED FROM GENERATED CODE */
1940 /* DIRTY HELPER (reads guest memory) */
1941 ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
1942 {
1943 ULong f64;
1944 convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
1945 return f64;
1946 }
1947
1948 /* CALLED FROM GENERATED CODE */
1949 /* DIRTY HELPER (writes guest memory) */
1950 void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
1951 {
1952 convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
1953 }
1954
1955
1956 /* CALLED FROM GENERATED CODE */
1957 /* CLEAN HELPER */
1958 /* mxcsr[15:0] contains an SSE native format MXCSR value.
1959 Extract from it the required SSEROUND value and any resulting
1960 emulation warning, and return (warn << 32) | sseround value.
1961 */
1962 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
1963 {
1964 /* Decide on a rounding mode. mxcsr[14:13] holds it. */
1965 /* NOTE, encoded exactly as per enum IRRoundingMode. */
1966 ULong rmode = (mxcsr >> 13) & 3;
1967
1968 /* Detect any required emulation warnings. */
1969 VexEmNote ew = EmNote_NONE;
1970
1971 if ((mxcsr & 0x1F80) != 0x1F80) {
1972 /* unmasked exceptions! */
1973 ew = EmWarn_X86_sseExns;
1974 }
1975 else
1976 if (mxcsr & (1<<15)) {
1977 /* FZ is set */
1978 ew = EmWarn_X86_fz;
1979 }
1980 else
1981 if (mxcsr & (1<<6)) {
1982 /* DAZ is set */
1983 ew = EmWarn_X86_daz;
1984 }
1985
1986 return (((ULong)ew) << 32) | ((ULong)rmode);
1987 }
1988
1989
1990 /* CALLED FROM GENERATED CODE */
1991 /* CLEAN HELPER */
1992 /* Given sseround as an IRRoundingMode value, create a suitable SSE
1993 native format MXCSR value. */
1994 ULong amd64g_create_mxcsr ( ULong sseround )
1995 {
1996 sseround &= 3;
1997 return 0x1F80 | (sseround << 13);
1998 }
1999
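/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere.  It shows that amd64g_create_mxcsr and amd64g_check_ldmxcsr
   round-trip for all four IRRoundingMode values, since the created MXCSR
   has every exception masked and FZ/DAZ clear, so no warning results. */
#if 0
static void example_mxcsr_roundtrip ( void )
{
   ULong rm;
   for (rm = 0; rm < 4; rm++) {
      ULong pair = amd64g_check_ldmxcsr( amd64g_create_mxcsr(rm) );
      vassert((pair & 0xFFFFFFFFULL) == rm);            /* same rounding mode */
      vassert((VexEmNote)(pair >> 32) == EmNote_NONE);  /* no emulation warning */
   }
}
#endif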
2000
2001 /* CLEAN HELPER */
2002 /* fpucw[15:0] contains an x87 native format FPU control word.
2003 Extract from it the required FPROUND value and any resulting
2004 emulation warning, and return (warn << 32) | fpround value.
2005 */
2006 ULong amd64g_check_fldcw ( ULong fpucw )
2007 {
2008 /* Decide on a rounding mode. fpucw[11:10] holds it. */
2009 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2010 ULong rmode = (fpucw >> 10) & 3;
2011
2012 /* Detect any required emulation warnings. */
2013 VexEmNote ew = EmNote_NONE;
2014
2015 if ((fpucw & 0x3F) != 0x3F) {
2016 /* unmasked exceptions! */
2017 ew = EmWarn_X86_x87exns;
2018 }
2019 else
2020 if (((fpucw >> 8) & 3) != 3) {
2021 /* unsupported precision */
2022 ew = EmWarn_X86_x87precision;
2023 }
2024
2025 return (((ULong)ew) << 32) | ((ULong)rmode);
2026 }
2027
2028
2029 /* CLEAN HELPER */
2030 /* Given fpround as an IRRoundingMode value, create a suitable x87
2031 native format FPU control word. */
2032 ULong amd64g_create_fpucw ( ULong fpround )
2033 {
2034 fpround &= 3;
2035 return 0x037F | (fpround << 10);
2036 }
2037
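/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere.  A control word with any exception unmasked causes
   amd64g_check_fldcw to report a warning in the top 32 bits of its result,
   while the rounding mode still comes back in the bottom 32 bits. */
#if 0
static void example_fldcw_warning ( void )
{
   /* fpucw = 0: all exceptions unmasked, rounding bits 11:10 = 00. */
   ULong pair = amd64g_check_fldcw(0x0000);
   vassert((pair & 0xFFFFFFFFULL) == (ULong)Irrm_NEAREST);
   vassert((VexEmNote)(pair >> 32) == EmWarn_X86_x87exns);
}
#endif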
2038
2039 /* This is used to implement 'fldenv'.
2040 Reads 28 bytes at x87_state[0 .. 27]. */
2041 /* CALLED FROM GENERATED CODE */
2042 /* DIRTY HELPER */
2043 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2044 /*IN*/HWord x87_state)
2045 {
2046 return do_put_x87( False, (UChar*)x87_state, vex_state );
2047 }
2048
2049
2050 /* CALLED FROM GENERATED CODE */
2051 /* DIRTY HELPER */
2052 /* Create an x87 FPU env from the guest state, as close as we can
2053 approximate it. Writes 28 bytes at x87_state[0..27]. */
2054 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2055 /*OUT*/HWord x87_state )
2056 {
2057 Int i, stno, preg;
2058 UInt tagw;
2059 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2060 Fpu_State* x87 = (Fpu_State*)x87_state;
2061 UInt ftop = vex_state->guest_FTOP;
2062 ULong c3210 = vex_state->guest_FC3210;
2063
2064 for (i = 0; i < 14; i++)
2065 x87->env[i] = 0;
2066
2067 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2068 x87->env[FP_ENV_STAT]
2069 = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2070 x87->env[FP_ENV_CTRL]
2071 = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2072
2073 /* Compute the x87 tag word. */
2074 tagw = 0;
2075 for (stno = 0; stno < 8; stno++) {
2076 preg = (stno + ftop) & 7;
2077 if (vexTags[preg] == 0) {
2078 /* register is empty */
2079 tagw |= (3 << (2*preg));
2080 } else {
2081 /* register is full. */
2082 tagw |= (0 << (2*preg));
2083 }
2084 }
2085 x87->env[FP_ENV_TAG] = toUShort(tagw);
2086
2087 /* We don't dump the x87 registers, tho. */
2088 }
2089
2090
2091 /* This is used to implement 'fnsave'.
2092 Writes 108 bytes at x87_state[0 .. 107]. */
2093 /* CALLED FROM GENERATED CODE */
2094 /* DIRTY HELPER */
2095 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2096 /*OUT*/HWord x87_state)
2097 {
2098 do_get_x87( vex_state, (UChar*)x87_state );
2099 }
2100
2101
2102 /* This is used to implement 'fnsaves'.
2103 Writes 94 bytes at x87_state[0 .. 93]. */
2104 /* CALLED FROM GENERATED CODE */
2105 /* DIRTY HELPER */
2106 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2107 /*OUT*/HWord x87_state)
2108 {
2109 Int i, stno, preg;
2110 UInt tagw;
2111 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2112 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2113 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2114 UInt ftop = vex_state->guest_FTOP;
2115 UInt c3210 = vex_state->guest_FC3210;
2116
2117 for (i = 0; i < 7; i++)
2118 x87->env[i] = 0;
2119
2120 x87->env[FPS_ENV_STAT]
2121 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2122 x87->env[FPS_ENV_CTRL]
2123 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2124
2125 /* Dump the register stack in ST order. */
2126 tagw = 0;
2127 for (stno = 0; stno < 8; stno++) {
2128 preg = (stno + ftop) & 7;
2129 if (vexTags[preg] == 0) {
2130 /* register is empty */
2131 tagw |= (3 << (2*preg));
2132 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2133 &x87->reg[10*stno] );
2134 } else {
2135 /* register is full. */
2136 tagw |= (0 << (2*preg));
2137 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2138 &x87->reg[10*stno] );
2139 }
2140 }
2141 x87->env[FPS_ENV_TAG] = toUShort(tagw);
2142 }
2143
2144
2145 /* This is used to implement 'frstor'.
2146 Reads 108 bytes at x87_state[0 .. 107]. */
2147 /* CALLED FROM GENERATED CODE */
2148 /* DIRTY HELPER */
2149 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2150 /*IN*/HWord x87_state)
2151 {
2152 return do_put_x87( True, (UChar*)x87_state, vex_state );
2153 }
2154
2155
2156 /* This is used to implement 'frstors'.
2157 Reads 94 bytes at x87_state[0 .. 93]. */
2158 /* CALLED FROM GENERATED CODE */
2159 /* DIRTY HELPER */
2160 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2161 /*IN*/HWord x87_state)
2162 {
2163 Int stno, preg;
2164 UInt tag;
2165 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2166 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2167 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2168 UInt ftop = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2169 UInt tagw = x87->env[FPS_ENV_TAG];
2170 UInt fpucw = x87->env[FPS_ENV_CTRL];
2171 UInt c3210 = x87->env[FPS_ENV_STAT] & 0x4700;
2172 VexEmNote ew;
2173 UInt fpround;
2174 ULong pair;
2175
2176 /* Copy registers and tags */
2177 for (stno = 0; stno < 8; stno++) {
2178 preg = (stno + ftop) & 7;
2179 tag = (tagw >> (2*preg)) & 3;
2180 if (tag == 3) {
2181 /* register is empty */
2182 /* hmm, if it's empty, does it still get written? Probably
2183 safer to say it does. If we don't, memcheck could get out
2184 of sync, in that it thinks all FP registers are defined by
2185 this helper, but in reality some have not been updated. */
2186 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2187 vexTags[preg] = 0;
2188 } else {
2189 /* register is non-empty */
2190 convert_f80le_to_f64le( &x87->reg[10*stno],
2191 (UChar*)&vexRegs[preg] );
2192 vexTags[preg] = 1;
2193 }
2194 }
2195
2196 /* stack pointer */
2197 vex_state->guest_FTOP = ftop;
2198
2199 /* status word */
2200 vex_state->guest_FC3210 = c3210;
2201
2202 /* handle the control word, setting FPROUND and detecting any
2203 emulation warnings. */
2204 pair = amd64g_check_fldcw ( (ULong)fpucw );
2205 fpround = (UInt)pair & 0xFFFFFFFFULL;
2206 ew = (VexEmNote)(pair >> 32);
2207
2208 vex_state->guest_FPROUND = fpround & 3;
2209
2210 /* emulation warnings --> caller */
2211 return ew;
2212 }
2213
2214
2215 /*---------------------------------------------------------------*/
2216 /*--- Misc integer helpers, including rotates and CPUID. ---*/
2217 /*---------------------------------------------------------------*/
2218
2219 /* Claim to be the following CPU, which is probably representative of
2220 the lowliest (earliest) amd64 offerings. It can do neither sse3
2221 nor cx16.
2222
2223 vendor_id : AuthenticAMD
2224 cpu family : 15
2225 model : 5
2226 model name : AMD Opteron (tm) Processor 848
2227 stepping : 10
2228 cpu MHz : 1797.682
2229 cache size : 1024 KB
2230 fpu : yes
2231 fpu_exception : yes
2232 cpuid level : 1
2233 wp : yes
2234 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2235 mtrr pge mca cmov pat pse36 clflush mmx fxsr
2236 sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2237 bogomips : 3600.62
2238 TLB size : 1088 4K pages
2239 clflush size : 64
2240 cache_alignment : 64
2241 address sizes : 40 bits physical, 48 bits virtual
2242 power management: ts fid vid ttp
2243
2244 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2245 we don't support them. See #291568. 3dnow is 80000001.EDX.31
2246 and 3dnowext is 80000001.EDX.30.
2247 */
2248 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2249 {
2250 # define SET_ABCD(_a,_b,_c,_d) \
2251 do { st->guest_RAX = (ULong)(_a); \
2252 st->guest_RBX = (ULong)(_b); \
2253 st->guest_RCX = (ULong)(_c); \
2254 st->guest_RDX = (ULong)(_d); \
2255 } while (0)
2256
2257 switch (0xFFFFFFFF & st->guest_RAX) {
2258 case 0x00000000:
2259 SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2260 break;
2261 case 0x00000001:
2262 SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2263 break;
2264 case 0x80000000:
2265 SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2266 break;
2267 case 0x80000001:
2268 /* Don't claim to support 3dnow or 3dnowext. 0xe1d3fbff is
2269 the original it-is-supported value that the h/w provides.
2270 See #291568. */
2271 SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
2272 0x21d3fbff);
2273 break;
2274 case 0x80000002:
2275 SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
2276 break;
2277 case 0x80000003:
2278 SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
2279 break;
2280 case 0x80000004:
2281 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2282 break;
2283 case 0x80000005:
2284 SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
2285 break;
2286 case 0x80000006:
2287 SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
2288 break;
2289 case 0x80000007:
2290 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
2291 break;
2292 case 0x80000008:
2293 SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
2294 break;
2295 default:
2296 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2297 break;
2298 }
2299 # undef SET_ABCD
2300 }
2301
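/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere.  The CPUID helpers in this section take the requested leaf from
   guest_RAX (and, where relevant, the sub-leaf from guest_RCX) and leave
   the EAX/EBX/ECX/EDX results in guest_RAX .. guest_RDX. */
#if 0
static UInt example_cpuid_max_leaf ( VexGuestAMD64State* st )
{
   st->guest_RAX = 0;                         /* leaf 0: vendor id string */
   amd64g_dirtyhelper_CPUID_baseline(st);
   return (UInt)st->guest_RAX;                /* max supported leaf, here 1 */
}
#endif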
2302
2303 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2304 capable.
2305
2306 vendor_id : GenuineIntel
2307 cpu family : 6
2308 model : 15
2309 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2310 stepping : 6
2311 cpu MHz : 2394.000
2312 cache size : 4096 KB
2313 physical id : 0
2314 siblings : 2
2315 core id : 0
2316 cpu cores : 2
2317 fpu : yes
2318 fpu_exception : yes
2319 cpuid level : 10
2320 wp : yes
2321 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2322 mtrr pge mca cmov pat pse36 clflush dts acpi
2323 mmx fxsr sse sse2 ss ht tm syscall nx lm
2324 constant_tsc pni monitor ds_cpl vmx est tm2
2325 cx16 xtpr lahf_lm
2326 bogomips : 4798.78
2327 clflush size : 64
2328 cache_alignment : 64
2329 address sizes : 36 bits physical, 48 bits virtual
2330 power management:
2331 */
2332 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
2333 {
2334 # define SET_ABCD(_a,_b,_c,_d) \
2335 do { st->guest_RAX = (ULong)(_a); \
2336 st->guest_RBX = (ULong)(_b); \
2337 st->guest_RCX = (ULong)(_c); \
2338 st->guest_RDX = (ULong)(_d); \
2339 } while (0)
2340
2341 switch (0xFFFFFFFF & st->guest_RAX) {
2342 case 0x00000000:
2343 SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
2344 break;
2345 case 0x00000001:
2346 SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
2347 break;
2348 case 0x00000002:
2349 SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
2350 break;
2351 case 0x00000003:
2352 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2353 break;
2354 case 0x00000004: {
2355 switch (0xFFFFFFFF & st->guest_RCX) {
2356 case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
2357 0x0000003f, 0x00000001); break;
2358 case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
2359 0x0000003f, 0x00000001); break;
2360 case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
2361 0x00000fff, 0x00000001); break;
2362 default: SET_ABCD(0x00000000, 0x00000000,
2363 0x00000000, 0x00000000); break;
2364 }
2365 break;
2366 }
2367 case 0x00000005:
2368 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
2369 break;
2370 case 0x00000006:
2371 SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
2372 break;
2373 case 0x00000007:
2374 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2375 break;
2376 case 0x00000008:
2377 SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
2378 break;
2379 case 0x00000009:
2380 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2381 break;
2382 case 0x0000000a:
2383 unhandled_eax_value:
2384 SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
2385 break;
2386 case 0x80000000:
2387 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2388 break;
2389 case 0x80000001:
2390 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
2391 break;
2392 case 0x80000002:
2393 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2394 break;
2395 case 0x80000003:
2396 SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
2397 break;
2398 case 0x80000004:
2399 SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
2400 break;
2401 case 0x80000005:
2402 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2403 break;
2404 case 0x80000006:
2405 SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
2406 break;
2407 case 0x80000007:
2408 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2409 break;
2410 case 0x80000008:
2411 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2412 break;
2413 default:
2414 goto unhandled_eax_value;
2415 }
2416 # undef SET_ABCD
2417 }
2418
2419
2420 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2421 capable.
2422
2423 vendor_id : GenuineIntel
2424 cpu family : 6
2425 model : 37
2426 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
2427 stepping : 2
2428 cpu MHz : 3334.000
2429 cache size : 4096 KB
2430 physical id : 0
2431 siblings : 4
2432 core id : 0
2433 cpu cores : 2
2434 apicid : 0
2435 initial apicid : 0
2436 fpu : yes
2437 fpu_exception : yes
2438 cpuid level : 11
2439 wp : yes
2440 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2441 mtrr pge mca cmov pat pse36 clflush dts acpi
2442 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2443 lm constant_tsc arch_perfmon pebs bts rep_good
2444 xtopology nonstop_tsc aperfmperf pni pclmulqdq
2445 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2446 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2447 arat tpr_shadow vnmi flexpriority ept vpid
2448 bogomips : 6957.57
2449 clflush size : 64
2450 cache_alignment : 64
2451 address sizes : 36 bits physical, 48 bits virtual
2452 power management:
2453 */
2454 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
2455 {
2456 # define SET_ABCD(_a,_b,_c,_d) \
2457 do { st->guest_RAX = (ULong)(_a); \
2458 st->guest_RBX = (ULong)(_b); \
2459 st->guest_RCX = (ULong)(_c); \
2460 st->guest_RDX = (ULong)(_d); \
2461 } while (0)
2462
2463 UInt old_eax = (UInt)st->guest_RAX;
2464 UInt old_ecx = (UInt)st->guest_RCX;
2465
2466 switch (old_eax) {
2467 case 0x00000000:
2468 SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
2469 break;
2470 case 0x00000001:
2471 SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
2472 break;
2473 case 0x00000002:
2474 SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
2475 break;
2476 case 0x00000003:
2477 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2478 break;
2479 case 0x00000004:
2480 switch (old_ecx) {
2481 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2482 0x0000003f, 0x00000000); break;
2483 case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
2484 0x0000007f, 0x00000000); break;
2485 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2486 0x000001ff, 0x00000000); break;
2487 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
2488 0x00000fff, 0x00000002); break;
2489 default: SET_ABCD(0x00000000, 0x00000000,
2490 0x00000000, 0x00000000); break;
2491 }
2492 break;
2493 case 0x00000005:
2494 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2495 break;
2496 case 0x00000006:
2497 SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
2498 break;
2499 case 0x00000007:
2500 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2501 break;
2502 case 0x00000008:
2503 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2504 break;
2505 case 0x00000009:
2506 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2507 break;
2508 case 0x0000000a:
2509 SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
2510 break;
2511 case 0x0000000b:
2512 switch (old_ecx) {
2513 case 0x00000000:
2514 SET_ABCD(0x00000001, 0x00000002,
2515 0x00000100, 0x00000000); break;
2516 case 0x00000001:
2517 SET_ABCD(0x00000004, 0x00000004,
2518 0x00000201, 0x00000000); break;
2519 default:
2520 SET_ABCD(0x00000000, 0x00000000,
2521 old_ecx, 0x00000000); break;
2522 }
2523 break;
2524 case 0x0000000c:
2525 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2526 break;
2527 case 0x0000000d:
2528 switch (old_ecx) {
2529 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
2530 0x00000100, 0x00000000); break;
2531 case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
2532 0x00000201, 0x00000000); break;
2533 default: SET_ABCD(0x00000000, 0x00000000,
2534 old_ecx, 0x00000000); break;
2535 }
2536 break;
2537 case 0x80000000:
2538 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2539 break;
2540 case 0x80000001:
2541 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
2542 break;
2543 case 0x80000002:
2544 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
2545 break;
2546 case 0x80000003:
2547 SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
2548 break;
2549 case 0x80000004:
2550 SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
2551 break;
2552 case 0x80000005:
2553 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2554 break;
2555 case 0x80000006:
2556 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
2557 break;
2558 case 0x80000007:
2559 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
2560 break;
2561 case 0x80000008:
2562 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2563 break;
2564 default:
2565 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
2566 break;
2567 }
2568 # undef SET_ABCD
2569 }
2570
2571
2572 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
2573 capable. Plus (kludge!) it "supports" HTM.
2574
2575 vendor_id : GenuineIntel
2576 cpu family : 6
2577 model : 42
2578 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
2579 stepping : 7
2580 cpu MHz : 1600.000
2581 cache size : 6144 KB
2582 physical id : 0
2583 siblings : 4
2584 core id : 3
2585 cpu cores : 4
2586 apicid : 6
2587 initial apicid : 6
2588 fpu : yes
2589 fpu_exception : yes
2590 cpuid level : 13
2591 wp : yes
2592 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2593 mtrr pge mca cmov pat pse36 clflush dts acpi
2594 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2595 lm constant_tsc arch_perfmon pebs bts rep_good
2596 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
2597 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
2598 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
2599 lahf_lm ida arat epb xsaveopt pln pts dts
2600 tpr_shadow vnmi flexpriority ept vpid
2601
2602 bogomips : 5768.94
2603 clflush size : 64
2604 cache_alignment : 64
2605 address sizes : 36 bits physical, 48 bits virtual
2606 power management:
2607 */
2608 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
2609 {
2610 # define SET_ABCD(_a,_b,_c,_d) \
2611 do { st->guest_RAX = (ULong)(_a); \
2612 st->guest_RBX = (ULong)(_b); \
2613 st->guest_RCX = (ULong)(_c); \
2614 st->guest_RDX = (ULong)(_d); \
2615 } while (0)
2616
2617 UInt old_eax = (UInt)st->guest_RAX;
2618 UInt old_ecx = (UInt)st->guest_RCX;
2619
2620 switch (old_eax) {
2621 case 0x00000000:
2622 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
2623 break;
2624 case 0x00000001:
2625 SET_ABCD(0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff);
2626 break;
2627 case 0x00000002:
2628 SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
2629 break;
2630 case 0x00000003:
2631 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2632 break;
2633 case 0x00000004:
2634 switch (old_ecx) {
2635 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
2636 0x0000003f, 0x00000000); break;
2637 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
2638 0x0000003f, 0x00000000); break;
2639 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
2640 0x000001ff, 0x00000000); break;
2641 case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
2642 0x00001fff, 0x00000006); break;
2643 default: SET_ABCD(0x00000000, 0x00000000,
2644 0x00000000, 0x00000000); break;
2645 }
2646 break;
2647 case 0x00000005:
2648 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
2649 break;
2650 case 0x00000006:
2651 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
2652 break;
2653 case 0x00000007:
2654 SET_ABCD(0x00000000, 0x00000800, 0x00000000, 0x00000000);
2655 break;
2656 case 0x00000008:
2657 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2658 break;
2659 case 0x00000009:
2660 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2661 break;
2662 case 0x0000000a:
2663 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
2664 break;
2665 case 0x0000000b:
2666 switch (old_ecx) {
2667 case 0x00000000:
2668 SET_ABCD(0x00000001, 0x00000001,
2669 0x00000100, 0x00000000); break;
2670 case 0x00000001:
2671 SET_ABCD(0x00000004, 0x00000004,
2672 0x00000201, 0x00000000); break;
2673 default:
2674 SET_ABCD(0x00000000, 0x00000000,
2675 old_ecx, 0x00000000); break;
2676 }
2677 break;
2678 case 0x0000000c:
2679 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2680 break;
2681 case 0x0000000d:
2682 switch (old_ecx) {
2683 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
2684 0x00000340, 0x00000000); break;
2685 case 0x00000001: SET_ABCD(0x00000001, 0x00000000,
2686 0x00000000, 0x00000000); break;
2687 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
2688 0x00000000, 0x00000000); break;
2689 default: SET_ABCD(0x00000000, 0x00000000,
2690 0x00000000, 0x00000000); break;
2691 }
2692 break;
2693 case 0x0000000e:
2694 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
2695 break;
2696 case 0x0000000f:
2697 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
2698 break;
2699 case 0x80000000:
2700 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
2701 break;
2702 case 0x80000001:
2703 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
2704 break;
2705 case 0x80000002:
2706 SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
2707 break;
2708 case 0x80000003:
2709 SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
2710 break;
2711 case 0x80000004:
2712 SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
2713 break;
2714 case 0x80000005:
2715 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
2716 break;
2717 case 0x80000006:
2718 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
2719 break;
2720 case 0x80000007:
2721 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
2722 break;
2723 case 0x80000008:
2724 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
2725 break;
2726 default:
2727 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
2728 break;
2729 }
2730 # undef SET_ABCD
2731 }
2732
2733
2734 ULong amd64g_calculate_RCR ( ULong arg,
2735 ULong rot_amt,
2736 ULong rflags_in,
2737 Long szIN )
2738 {
2739 Bool wantRflags = toBool(szIN < 0);
2740 ULong sz = wantRflags ? (-szIN) : szIN;
2741 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
2742 ULong cf=0, of=0, tempcf;
2743
2744 switch (sz) {
2745 case 8:
2746 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2747 of = ((arg >> 63) ^ cf) & 1;
2748 while (tempCOUNT > 0) {
2749 tempcf = arg & 1;
2750 arg = (arg >> 1) | (cf << 63);
2751 cf = tempcf;
2752 tempCOUNT--;
2753 }
2754 break;
2755 case 4:
2756 while (tempCOUNT >= 33) tempCOUNT -= 33;
2757 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2758 of = ((arg >> 31) ^ cf) & 1;
2759 while (tempCOUNT > 0) {
2760 tempcf = arg & 1;
2761 arg = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
2762 cf = tempcf;
2763 tempCOUNT--;
2764 }
2765 break;
2766 case 2:
2767 while (tempCOUNT >= 17) tempCOUNT -= 17;
2768 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2769 of = ((arg >> 15) ^ cf) & 1;
2770 while (tempCOUNT > 0) {
2771 tempcf = arg & 1;
2772 arg = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
2773 cf = tempcf;
2774 tempCOUNT--;
2775 }
2776 break;
2777 case 1:
2778 while (tempCOUNT >= 9) tempCOUNT -= 9;
2779 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2780 of = ((arg >> 7) ^ cf) & 1;
2781 while (tempCOUNT > 0) {
2782 tempcf = arg & 1;
2783 arg = ((arg >> 1) & 0x7FULL) | (cf << 7);
2784 cf = tempcf;
2785 tempCOUNT--;
2786 }
2787 break;
2788 default:
2789 vpanic("calculate_RCR(amd64g): invalid size");
2790 }
2791
2792 cf &= 1;
2793 of &= 1;
2794 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
2795 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
2796
2797 /* caller can ask to have back either the resulting flags or
2798 resulting value, but not both */
2799 return wantRflags ? rflags_in : arg;
2800 }
2801
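/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere.  It shows the szIN sign convention of amd64g_calculate_RCR:
   a positive size asks for the rotated value, a negated size asks for the
   resulting rflags instead.  For an 8-bit RCR of 0x81 by 1 with CF clear,
   the value becomes 0x40 and both CF and OF end up set. */
#if 0
static void example_rcr_usage ( void )
{
   ULong flags_in = 0;   /* CF = 0 on entry */
   ULong val   = amd64g_calculate_RCR(0x81, 1, flags_in,  1 /*size 1*/);
   ULong flags = amd64g_calculate_RCR(0x81, 1, flags_in, -1 /*want flags*/);
   vassert(val == 0x40);
   vassert((flags >> AMD64G_CC_SHIFT_C) & 1);   /* bit 0 rotated into CF */
   vassert((flags >> AMD64G_CC_SHIFT_O) & 1);   /* old top bit ^ old CF */
}
#endif
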
2802 ULong amd64g_calculate_RCL ( ULong arg,
2803 ULong rot_amt,
2804 ULong rflags_in,
2805 Long szIN )
2806 {
2807 Bool wantRflags = toBool(szIN < 0);
2808 ULong sz = wantRflags ? (-szIN) : szIN;
2809 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
2810 ULong cf=0, of=0, tempcf;
2811
2812 switch (sz) {
2813 case 8:
2814 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2815 while (tempCOUNT > 0) {
2816 tempcf = (arg >> 63) & 1;
2817 arg = (arg << 1) | (cf & 1);
2818 cf = tempcf;
2819 tempCOUNT--;
2820 }
2821 of = ((arg >> 63) ^ cf) & 1;
2822 break;
2823 case 4:
2824 while (tempCOUNT >= 33) tempCOUNT -= 33;
2825 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2826 while (tempCOUNT > 0) {
2827 tempcf = (arg >> 31) & 1;
2828 arg = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
2829 cf = tempcf;
2830 tempCOUNT--;
2831 }
2832 of = ((arg >> 31) ^ cf) & 1;
2833 break;
2834 case 2:
2835 while (tempCOUNT >= 17) tempCOUNT -= 17;
2836 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2837 while (tempCOUNT > 0) {
2838 tempcf = (arg >> 15) & 1;
2839 arg = 0xFFFFULL & ((arg << 1) | (cf & 1));
2840 cf = tempcf;
2841 tempCOUNT--;
2842 }
2843 of = ((arg >> 15) ^ cf) & 1;
2844 break;
2845 case 1:
2846 while (tempCOUNT >= 9) tempCOUNT -= 9;
2847 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
2848 while (tempCOUNT > 0) {
2849 tempcf = (arg >> 7) & 1;
2850 arg = 0xFFULL & ((arg << 1) | (cf & 1));
2851 cf = tempcf;
2852 tempCOUNT--;
2853 }
2854 of = ((arg >> 7) ^ cf) & 1;
2855 break;
2856 default:
2857 vpanic("calculate_RCL(amd64g): invalid size");
2858 }
2859
2860 cf &= 1;
2861 of &= 1;
2862 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
2863 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
2864
2865 return wantRflags ? rflags_in : arg;
2866 }
2867
2868 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
2869 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
2870 */
2871 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
2872 {
2873 ULong hi, lo, tmp, A[16];
2874
2875 A[0] = 0; A[1] = a;
2876 A[2] = A[1] << 1; A[3] = A[2] ^ a;
2877 A[4] = A[2] << 1; A[5] = A[4] ^ a;
2878 A[6] = A[3] << 1; A[7] = A[6] ^ a;
2879 A[8] = A[4] << 1; A[9] = A[8] ^ a;
2880 A[10] = A[5] << 1; A[11] = A[10] ^ a;
2881 A[12] = A[6] << 1; A[13] = A[12] ^ a;
2882 A[14] = A[7] << 1; A[15] = A[14] ^ a;
2883
2884 lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
2885 hi = lo >> 56;
2886 lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
2887 hi = (hi << 8) | (lo >> 56);
2888 lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
2889 hi = (hi << 8) | (lo >> 56);
2890 lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
2891 hi = (hi << 8) | (lo >> 56);
2892 lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
2893 hi = (hi << 8) | (lo >> 56);
2894 lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
2895 hi = (hi << 8) | (lo >> 56);
2896 lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
2897 hi = (hi << 8) | (lo >> 56);
2898 lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
2899
2900 ULong m0 = -1;
2901 m0 /= 255;
2902 tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
2903 tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
2904 tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
2905 tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
2906 tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
2907 tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
2908 tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
2909
2910 return which ? hi : lo;
2911 }
2912
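/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere.  Carry-less multiplication XORs the shifted partial products
   instead of adding them, so 3 * 3 = 0b11 ^ (0b11 << 1) = 0b101 = 5, with an
   all-zero high half.  'which' selects the high or low 64 bits of the
   128-bit product. */
#if 0
static void example_pclmul ( void )
{
   vassert(amd64g_calculate_pclmul(3, 3, 0/*lo*/) == 5);
   vassert(amd64g_calculate_pclmul(3, 3, 1/*hi*/) == 0);
}
#endif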
2913
2914 /* CALLED FROM GENERATED CODE */
2915 /* DIRTY HELPER (non-referentially-transparent) */
2916 /* Horrible hack. On non-amd64 platforms, return 1. */
2917 ULong amd64g_dirtyhelper_RDTSC ( void )
2918 {
2919 # if defined(__x86_64__)
2920 UInt eax, edx;
2921 __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
2922 return (((ULong)edx) << 32) | ((ULong)eax);
2923 # else
2924 return 1ULL;
2925 # endif
2926 }
2927
2928 /* CALLED FROM GENERATED CODE */
2929 /* DIRTY HELPER (non-referentially-transparent) */
2930 /* Horrible hack. On non-amd64 platforms, do nothing. */
2931 /* This uses a different calling convention from _RDTSC just above
2932 only because of the difficulty of returning 96 bits from a C
2933 function -- RDTSC returns 64 bits and so is simple by comparison,
2934 on amd64. */
2935 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
2936 {
2937 # if defined(__x86_64__)
2938 UInt eax, ecx, edx;
2939 __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
2940 st->guest_RAX = (ULong)eax;
2941 st->guest_RCX = (ULong)ecx;
2942 st->guest_RDX = (ULong)edx;
2943 # else
2944 /* Do nothing. */
2945 # endif
2946 }
2947
2948 /* CALLED FROM GENERATED CODE */
2949 /* DIRTY HELPER (non-referentially-transparent) */
2950 /* Horrible hack. On non-amd64 platforms, return 0. */
2951 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
2952 {
2953 # if defined(__x86_64__)
2954 ULong r = 0;
2955 portno &= 0xFFFF;
2956 switch (sz) {
2957 case 4:
2958 __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
2959 : "=a" (r) : "Nd" (portno));
2960 break;
2961 case 2:
2962 __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
2963 : "=a" (r) : "Nd" (portno));
2964 break;
2965 case 1:
2966 __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
2967 : "=a" (r) : "Nd" (portno));
2968 break;
2969 default:
2970 break; /* note: no 64-bit version of insn exists */
2971 }
2972 return r;
2973 # else
2974 return 0;
2975 # endif
2976 }
2977
2978
2979 /* CALLED FROM GENERATED CODE */
2980 /* DIRTY HELPER (non-referentially-transparent) */
2981 /* Horrible hack. On non-amd64 platforms, do nothing. */
2982 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
2983 {
2984 # if defined(__x86_64__)
2985 portno &= 0xFFFF;
2986 switch (sz) {
2987 case 4:
2988 __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
2989 : : "a" (data), "Nd" (portno));
2990 break;
2991 case 2:
2992 __asm__ __volatile__("outw %w0, %w1"
2993 : : "a" (data), "Nd" (portno));
2994 break;
2995 case 1:
2996 __asm__ __volatile__("outb %b0, %w1"
2997 : : "a" (data), "Nd" (portno));
2998 break;
2999 default:
3000 break; /* note: no 64-bit version of insn exists */
3001 }
3002 # else
3003 /* do nothing */
3004 # endif
3005 }
3006
3007 /* CALLED FROM GENERATED CODE */
3008 /* DIRTY HELPER (non-referentially-transparent) */
3009 /* Horrible hack. On non-amd64 platforms, just write zeroes. */
3010 /* op = 0: call the native SGDT instruction.
3011 op = 1: call the native SIDT instruction.
3012 */
3013 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
3014 # if defined(__x86_64__)
3015 switch (op) {
3016 case 0:
3017 __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
3018 break;
3019 case 1:
3020 __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
3021 break;
3022 default:
3023 vpanic("amd64g_dirtyhelper_SxDT");
3024 }
3025 # else
3026 /* do nothing */
3027    /* Fake it by writing a zeroed-out 10-byte descriptor. */
3028 p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
3029 p[6] = p[7] = p[8] = p[9] = 0;
3030 # endif
3031 }
3032
3033 /*---------------------------------------------------------------*/
3034 /*--- Helpers for MMX/SSE/SSE2. ---*/
3035 /*---------------------------------------------------------------*/
3036
3037 static inline UChar abdU8 ( UChar xx, UChar yy ) {
3038 return toUChar(xx>yy ? xx-yy : yy-xx);
3039 }
3040
3041 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
3042 return (((ULong)w1) << 32) | ((ULong)w0);
3043 }
3044
3045 static inline UShort sel16x4_3 ( ULong w64 ) {
3046 UInt hi32 = toUInt(w64 >> 32);
3047 return toUShort(hi32 >> 16);
3048 }
3049 static inline UShort sel16x4_2 ( ULong w64 ) {
3050 UInt hi32 = toUInt(w64 >> 32);
3051 return toUShort(hi32);
3052 }
3053 static inline UShort sel16x4_1 ( ULong w64 ) {
3054 UInt lo32 = toUInt(w64);
3055 return toUShort(lo32 >> 16);
3056 }
3057 static inline UShort sel16x4_0 ( ULong w64 ) {
3058 UInt lo32 = toUInt(w64);
3059 return toUShort(lo32);
3060 }
3061
3062 static inline UChar sel8x8_7 ( ULong w64 ) {
3063 UInt hi32 = toUInt(w64 >> 32);
3064 return toUChar(hi32 >> 24);
3065 }
3066 static inline UChar sel8x8_6 ( ULong w64 ) {
3067 UInt hi32 = toUInt(w64 >> 32);
3068 return toUChar(hi32 >> 16);
3069 }
3070 static inline UChar sel8x8_5 ( ULong w64 ) {
3071 UInt hi32 = toUInt(w64 >> 32);
3072 return toUChar(hi32 >> 8);
3073 }
3074 static inline UChar sel8x8_4 ( ULong w64 ) {
3075 UInt hi32 = toUInt(w64 >> 32);
3076 return toUChar(hi32 >> 0);
3077 }
3078 static inline UChar sel8x8_3 ( ULong w64 ) {
3079 UInt lo32 = toUInt(w64);
3080 return toUChar(lo32 >> 24);
3081 }
3082 static inline UChar sel8x8_2 ( ULong w64 ) {
3083 UInt lo32 = toUInt(w64);
3084 return toUChar(lo32 >> 16);
3085 }
3086 static inline UChar sel8x8_1 ( ULong w64 ) {
3087 UInt lo32 = toUInt(w64);
3088 return toUChar(lo32 >> 8);
3089 }
3090 static inline UChar sel8x8_0 ( ULong w64 ) {
3091 UInt lo32 = toUInt(w64);
3092 return toUChar(lo32 >> 0);
3093 }
3094
3095 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3096 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
3097 {
3098 return
3099 mk32x2(
3100 (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
3101 + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
3102 (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
3103 + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
3104 );
3105 }
3106
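/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere.  PMADDWD forms two signed 16x16->32 products per 32-bit lane
   and adds them; with every 16-bit lane holding 1, each 32-bit lane of the
   result is 1*1 + 1*1 = 2. */
#if 0
static void example_pmaddwd ( void )
{
   ULong ones = 0x0001000100010001ULL;
   vassert(amd64g_calculate_mmx_pmaddwd(ones, ones) == 0x0000000200000002ULL);
}
#endif
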
3107 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3108 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
3109 {
3110 UInt t = 0;
3111 t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
3112 t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
3113 t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
3114 t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
3115 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3116 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3117 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3118 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3119 t &= 0xFFFF;
3120 return (ULong)t;
3121 }
3122
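/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere.  PSADBW sums the absolute differences of the eight byte lanes
   and returns that sum in the low 16 bits; eight lanes each differing by 1
   therefore give 8. */
#if 0
static void example_psadbw ( void )
{
   vassert(amd64g_calculate_mmx_psadbw(0, 0x0101010101010101ULL) == 8);
}
#endif
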
3123 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3124 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
3125 {
3126 UShort t, min;
3127 UInt idx;
3128 t = sel16x4_0(sLo); if (True) { min = t; idx = 0; }
3129 t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
3130 t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
3131 t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
3132 t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
3133 t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
3134 t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
3135 t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
3136 return ((ULong)(idx << 16)) | ((ULong)min);
3137 }
3138
3139 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3140 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
3141 {
3142 UInt i;
3143 ULong crc = (b & 0xFFULL) ^ crcIn;
3144 for (i = 0; i < 8; i++)
3145 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3146 return crc;
3147 }
3148
3149 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3150 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
3151 {
3152 UInt i;
3153 ULong crc = (w & 0xFFFFULL) ^ crcIn;
3154 for (i = 0; i < 16; i++)
3155 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3156 return crc;
3157 }
3158
3159 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3160 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
3161 {
3162 UInt i;
3163 ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
3164 for (i = 0; i < 32; i++)
3165 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
3166 return crc;
3167 }
3168
3169 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3170 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
3171 {
3172 ULong crc = amd64g_calc_crc32l(crcIn, q);
3173 return amd64g_calc_crc32l(crc, q >> 32);
3174 }
3175
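/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere.  All four CRC helpers above step the same bit-reflected CRC-32C
   (Castagnoli) polynomial, 0x82F63B78, as used by the SSE4.2 CRC32
   instruction.  A byte-at-a-time accumulation over a buffer, assuming the
   caller wants the conventional initial and final inversions, might look
   like this: */
#if 0
static ULong example_crc32c_of_buffer ( const UChar* buf, ULong len )
{
   ULong crc = 0xFFFFFFFFULL;              /* conventional initial value */
   ULong i;
   for (i = 0; i < len; i++)
      crc = amd64g_calc_crc32b(crc, buf[i]);
   return crc ^ 0xFFFFFFFFULL;             /* conventional final inversion */
}
#endif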
3176
3177 /* .. helper for next fn .. */
3178 static inline ULong sad_8x4 ( ULong xx, ULong yy )
3179 {
3180 UInt t = 0;
3181 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
3182 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
3183 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
3184 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
3185 return (ULong)t;
3186 }
3187
3188 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3189 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
3190 ULong dHi, ULong dLo,
3191 ULong imm_and_return_control_bit )
3192 {
3193 UInt imm8 = imm_and_return_control_bit & 7;
3194 Bool calcHi = (imm_and_return_control_bit >> 7) & 1;
3195 UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
3196 UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
3197 /* For src we only need 32 bits, so get them into the
3198 lower half of a 64 bit word. */
3199 ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
3200 /* For dst we need to get hold of 56 bits (7 bytes) from a total of
3201 11 bytes. If calculating the low part of the result, need bytes
3202 dstOffsL * 4 + (0 .. 6); if calculating the high part,
3203 dstOffsL * 4 + (4 .. 10). */
3204 ULong dst;
3205    /* dstOffsL = 0, Lo -> 0 .. 6
3206       dstOffsL = 1, Lo -> 4 .. 10
3207       dstOffsL = 0, Hi -> 4 .. 10
3208       dstOffsL = 1, Hi -> 8 .. 14
3209 */
3210 if (calcHi && dstOffsL) {
3211 /* 8 .. 14 */
3212 dst = dHi & 0x00FFFFFFFFFFFFFFULL;
3213 }
3214 else if (!calcHi && !dstOffsL) {
3215 /* 0 .. 6 */
3216 dst = dLo & 0x00FFFFFFFFFFFFFFULL;
3217 }
3218 else {
3219 /* 4 .. 10 */
3220 dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
3221 }
3222 ULong r0 = sad_8x4( dst >> 0, src );
3223 ULong r1 = sad_8x4( dst >> 8, src );
3224 ULong r2 = sad_8x4( dst >> 16, src );
3225 ULong r3 = sad_8x4( dst >> 24, src );
3226 ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
3227 return res;
3228 }
3229
3230 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3231 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
3232 {
3233 ULong dst = 0;
3234 ULong src_bit;
3235 ULong dst_bit = 1;
3236 for (src_bit = 1; src_bit; src_bit <<= 1) {
3237 if (mask & src_bit) {
3238 if (src_masked & src_bit) dst |= dst_bit;
3239 dst_bit <<= 1;
3240 }
3241 }
3242 return dst;
3243 }
3244
3245 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
3246 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
3247 {
3248 ULong dst = 0;
3249 ULong dst_bit;
3250 ULong src_bit = 1;
3251 for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
3252 if (mask & dst_bit) {
3253 if (src & src_bit) dst |= dst_bit;
3254 src_bit <<= 1;
3255 }
3256 }
3257 return dst;
3258 }
3259
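/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere.  PEXT gathers the source bits selected by the mask into the low
   end of the result, and PDEP scatters them back out, so the two are
   inverses on the in-mask bits. */
#if 0
static void example_pext_pdep ( void )
{
   vassert(amd64g_calculate_pext(0xA0, 0xF0) == 0xA);    /* gather bits 7..4 */
   vassert(amd64g_calculate_pdep(0xA,  0xF0) == 0xA0);   /* scatter them back */
}
#endif
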
3260 /*---------------------------------------------------------------*/
3261 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
3262 /*---------------------------------------------------------------*/
3263
3264 static UInt zmask_from_V128 ( V128* arg )
3265 {
3266 UInt i, res = 0;
3267 for (i = 0; i < 16; i++) {
3268 res |= ((arg->w8[i] == 0) ? 1 : 0) << i;
3269 }
3270 return res;
3271 }
3272
3273 static UInt zmask_from_V128_wide ( V128* arg )
3274 {
3275 UInt i, res = 0;
3276 for (i = 0; i < 8; i++) {
3277 res |= ((arg->w16[i] == 0) ? 1 : 0) << i;
3278 }
3279 return res;
3280 }
3281
3282 /* Helps with PCMP{I,E}STR{I,M}.
3283
3284 CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really,
3285 actually it could be a clean helper, but for the fact that we can't
3286 pass by value 2 x V128 to a clean helper, nor have one returned.)
3287 Reads guest state, writes to guest state for the xSTRM cases, no
3288 accesses of memory, is a pure function.
3289
3290 opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
3291 the callee knows which I/E and I/M variant it is dealing with and
3292 what the specific operation is. 4th byte of opcode is in the range
3293 0x60 to 0x63:
3294 istri 66 0F 3A 63
3295 istrm 66 0F 3A 62
3296 estri 66 0F 3A 61
3297 estrm 66 0F 3A 60
3298
3299 gstOffL and gstOffR are the guest state offsets for the two XMM
3300 register inputs. We never have to deal with the memory case since
3301 that is handled by pre-loading the relevant value into the fake
3302 XMM16 register.
3303
3304 For ESTRx variants, edxIN and eaxIN hold the values of those two
3305 registers.
3306
3307 In all cases, the bottom 16 bits of the result contain the new
3308 OSZACP %rflags values. For xSTRI variants, bits[31:16] of the
3309 result hold the new %ecx value. For xSTRM variants, the helper
3310 writes the result directly to the guest XMM0.
3311
3312 Declarable side effects: in all cases, reads guest state at
3313 [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes
3314 guest_XMM0.
3315
3316 Is expected to be called with opc_and_imm combinations which have
3317 actually been validated, and will assert if otherwise. The front
3318 end should ensure we're only called with verified values.
3319 */
3320 ULong amd64g_dirtyhelper_PCMPxSTRx (
3321 VexGuestAMD64State* gst,
3322 HWord opc4_and_imm,
3323 HWord gstOffL, HWord gstOffR,
3324 HWord edxIN, HWord eaxIN
3325 )
3326 {
3327 HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
3328 HWord imm8 = opc4_and_imm & 0xFF;
3329 HWord isISTRx = opc4 & 2;
3330 HWord isxSTRM = (opc4 & 1) ^ 1;
3331 vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
3332 HWord wide = (imm8 & 1);
3333
3334 // where the args are
3335 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3336 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3337
3338 /* Create the arg validity masks, either from the vectors
3339 themselves or from the supplied edx/eax values. */
3340 // FIXME: this is only right for the 8-bit data cases.
3341 // At least that is asserted above.
3342 UInt zmaskL, zmaskR;
3343
3344 // temp spot for the resulting flags and vector.
3345 V128 resV;
3346 UInt resOSZACP;
3347
3348 // for checking whether case was handled
3349 Bool ok = False;
3350
3351 if (wide) {
3352 if (isISTRx) {
3353 zmaskL = zmask_from_V128_wide(argL);
3354 zmaskR = zmask_from_V128_wide(argR);
3355 } else {
3356 Int tmp;
3357 tmp = edxIN & 0xFFFFFFFF;
3358 if (tmp < -8) tmp = -8;
3359 if (tmp > 8) tmp = 8;
3360 if (tmp < 0) tmp = -tmp;
3361 vassert(tmp >= 0 && tmp <= 8);
3362 zmaskL = (1 << tmp) & 0xFF;
3363 tmp = eaxIN & 0xFFFFFFFF;
3364 if (tmp < -8) tmp = -8;
3365 if (tmp > 8) tmp = 8;
3366 if (tmp < 0) tmp = -tmp;
3367 vassert(tmp >= 0 && tmp <= 8);
3368 zmaskR = (1 << tmp) & 0xFF;
3369 }
3370       // do the math
3371 ok = compute_PCMPxSTRx_wide (
3372 &resV, &resOSZACP, argL, argR,
3373 zmaskL, zmaskR, imm8, (Bool)isxSTRM
3374 );
3375 } else {
3376 if (isISTRx) {
3377 zmaskL = zmask_from_V128(argL);
3378 zmaskR = zmask_from_V128(argR);
3379 } else {
3380 Int tmp;
3381 tmp = edxIN & 0xFFFFFFFF;
3382 if (tmp < -16) tmp = -16;
3383 if (tmp > 16) tmp = 16;
3384 if (tmp < 0) tmp = -tmp;
3385 vassert(tmp >= 0 && tmp <= 16);
3386 zmaskL = (1 << tmp) & 0xFFFF;
3387 tmp = eaxIN & 0xFFFFFFFF;
3388 if (tmp < -16) tmp = -16;
3389 if (tmp > 16) tmp = 16;
3390 if (tmp < 0) tmp = -tmp;
3391 vassert(tmp >= 0 && tmp <= 16);
3392 zmaskR = (1 << tmp) & 0xFFFF;
3393 }
3394       // do the math
3395 ok = compute_PCMPxSTRx (
3396 &resV, &resOSZACP, argL, argR,
3397 zmaskL, zmaskR, imm8, (Bool)isxSTRM
3398 );
3399 }
3400
3401 // front end shouldn't pass us any imm8 variants we can't
3402 // handle. Hence:
3403 vassert(ok);
3404
3405 // So, finally we need to get the results back to the caller.
3406 // In all cases, the new OSZACP value is the lowest 16 of
3407 // the return value.
3408 if (isxSTRM) {
3409 gst->guest_YMM0[0] = resV.w32[0];
3410 gst->guest_YMM0[1] = resV.w32[1];
3411 gst->guest_YMM0[2] = resV.w32[2];
3412 gst->guest_YMM0[3] = resV.w32[3];
3413 return resOSZACP & 0x8D5;
3414 } else {
3415 UInt newECX = resV.w32[0] & 0xFFFF;
3416 return (newECX << 16) | (resOSZACP & 0x8D5);
3417 }
3418 }
3419
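/* Illustrative sketch only -- hypothetical, not compiled or called from
   anywhere; offL/offR stand for real guest state offsets.  It shows how a
   caller would build opc4_and_imm for PCMPISTRI (4th opcode byte 0x63) and
   unpack the packed result, per the big comment above: the new OSZACP bits
   are in result[15:0] and, for the xSTRI variants, the new %ecx is in
   result[31:16].  edxIN/eaxIN are ignored for the I variants. */
#if 0
static void example_pcmpistri ( VexGuestAMD64State* gst,
                                HWord offL, HWord offR )
{
   HWord opc4_and_imm = (0x63 << 8) | 0x0C;   /* istri, imm8 = 0x0C */
   ULong r = amd64g_dirtyhelper_PCMPxSTRx(gst, opc4_and_imm,
                                          offL, offR, 0, 0);
   UInt newOSZACP = (UInt)(r & 0xFFFF);
   UInt newECX    = (UInt)((r >> 16) & 0xFFFF);
   (void)newOSZACP; (void)newECX;
}
#endif
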
3420 /*---------------------------------------------------------------*/
3421 /*--- AES primitives and helpers ---*/
3422 /*---------------------------------------------------------------*/
3423 /* a 16 x 16 matrix */
3424 static const UChar sbox[256] = { // row nr
3425 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
3426 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
3427 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
3428 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
3429 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
3430 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
3431 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
3432 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
3433 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
3434 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
3435 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
3436 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
3437 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
3438 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
3439 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
3440 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
3441 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
3442 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
3443 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
3444 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
3445 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
3446 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
3447 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
3448 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
3449 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
3450 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
3451 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
3452 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
3453 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
3454 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
3455 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
3456 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
3457 };
3458 static void SubBytes (V128* v)
3459 {
3460 V128 r;
3461 UInt i;
3462 for (i = 0; i < 16; i++)
3463 r.w8[i] = sbox[v->w8[i]];
3464 *v = r;
3465 }
3466
3467 /* a 16 x 16 matrix */
3468 static const UChar invsbox[256] = { // row nr
3469 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
3470 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
3471 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
3472 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
3473 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
3474 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
3475 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
3476 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
3477 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
3478 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
3479 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
3480 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
3481 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
3482 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
3483 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
3484 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
3485 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
3486 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
3487 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
3488 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
3489 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
3490 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
3491 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
3492 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
3493 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
3494 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
3495 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
3496 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
3497 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
3498 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
3499 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
3500 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
3501 };
3502 static void InvSubBytes (V128* v)
3503 {
3504 V128 r;
3505 UInt i;
3506 for (i = 0; i < 16; i++)
3507 r.w8[i] = invsbox[v->w8[i]];
3508 *v = r;
3509 }
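
/* Illustrative self-check (not part of the build): invsbox is the
   inverse permutation of sbox, so InvSubBytes undoes SubBytes.  A
   hypothetical assertion over all 256 byte values: */
#if 0
static void check_sbox_tables_are_inverse ( void )
{
   UInt i;
   for (i = 0; i < 256; i++) {
      vassert(invsbox[sbox[i]] == i);
      vassert(sbox[invsbox[i]] == i);
   }
}
#endif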
3510
3511 static const UChar ShiftRows_op[16] =
3512 {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
3513 static void ShiftRows (V128* v)
3514 {
3515 V128 r;
3516 UInt i;
3517 for (i = 0; i < 16; i++)
3518 r.w8[i] = v->w8[ShiftRows_op[15-i]];
3519 *v = r;
3520 }
3521
3522 static const UChar InvShiftRows_op[16] =
3523 {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
3524 static void InvShiftRows (V128* v)
3525 {
3526 V128 r;
3527 UInt i;
3528 for (i = 0; i < 16; i++)
3529 r.w8[i] = v->w8[InvShiftRows_op[15-i]];
3530 *v = r;
3531 }
3532
3533 /* Multiplication of the finite field elements of AES.
3534 See "A Specification for The AES Algorithm Rijndael
3535 (by Joan Daemen & Vincent Rijmen)"
3536 Dr. Brian Gladman, v3.1, 3rd March 2001. */
3537 /* N values such that (hex) xy = 0x03^N.
3538    0x00 has no such N; we store 0xff for that entry. */
3539 /* a 16 x 16 matrix */
3540 static const UChar Nxy[256] = { // row nr
3541 0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
3542 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
3543 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
3544 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
3545 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
3546 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
3547 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
3548 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
3549 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
3550 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
3551 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
3552 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
3553 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
3554 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
3555 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
3556 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
3557 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
3558 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
3559 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
3560 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
3561 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
3562 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
3563 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
3564 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
3565 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
3566 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
3567 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
3568 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
3569 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
3570 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
3571 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
3572 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
3573 };
3574
3575 /* E values so that E = 0x03^xy. */
3576 static const UChar Exy[256] = { // row nr
3577 0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
3578 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
3579 0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
3580 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
3581 0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
3582 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
3583 0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
3584 0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
3585 0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
3586 0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
3587 0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
3588 0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
3589 0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
3590 0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
3591 0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
3592 0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
3593 0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
3594 0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
3595 0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
3596 0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
3597 0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
3598 0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
3599 0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
3600 0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
3601 0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
3602 0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
3603 0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
3604 0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
3605 0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
3606 0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
3607 0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
3608 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
3609
3610 static inline UChar ff_mul(UChar u1, UChar u2)
3611 {
3612 if ((u1 > 0) && (u2 > 0)) {
3613 UInt ui = Nxy[u1] + Nxy[u2];
3614 if (ui >= 255)
3615 ui = ui - 255;
3616 return Exy[ui];
3617 } else {
3618 return 0;
3619    }
3620 }
3621
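/* Illustrative sketch (not part of the build): ff_mul multiplies two
   GF(2^8) elements by adding their discrete logs (base 0x03, table
   Nxy) modulo 255 and looking up the antilog in Exy.  FIPS-197 gives
   {57} * {83} = {c1} as a worked example, which this hypothetical
   check reproduces: */
#if 0
static void check_ff_mul_example ( void )
{
   vassert(ff_mul(0x57, 0x83) == 0xC1);
   vassert(ff_mul(0x57, 0x00) == 0x00);  /* anything times zero is zero */
}
#endif
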
3622 static void MixColumns (V128* v)
3623 {
3624 V128 r;
3625 Int j;
3626 #define P(x,row,col) (x)->w8[((row)*4+(col))]
3627 for (j = 0; j < 4; j++) {
3628 P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
3629 ^ P(v,j,2) ^ P(v,j,3);
3630 P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
3631 ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
3632 P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
3633 ^ ff_mul(0x03, P(v,j,3) );
3634 P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
3635 ^ ff_mul( 0x02, P(v,j,3) );
3636 }
3637 *v = r;
3638 #undef P
3639 }
3640
3641 static void InvMixColumns (V128* v)
3642 {
3643 V128 r;
3644 Int j;
3645 #define P(x,row,col) (x)->w8[((row)*4+(col))]
3646 for (j = 0; j < 4; j++) {
3647 P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
3648 ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
3649 P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
3650 ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
3651 P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
3652 ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
3653 P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
3654 ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
3655 }
3656 *v = r;
3657 #undef P
3658
3659 }
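
/* Illustrative sketch (not part of the build): InvMixColumns is the
   inverse of MixColumns, so applying one after the other should give
   back the original state.  A hypothetical round-trip check: */
#if 0
static void check_mixcolumns_roundtrip ( void )
{
   V128 v, orig;
   UInt i;
   for (i = 0; i < 16; i++)
      v.w8[i] = (UChar)(i * 17 + 3);  /* arbitrary test pattern */
   orig = v;
   MixColumns(&v);
   InvMixColumns(&v);
   for (i = 0; i < 16; i++)
      vassert(v.w8[i] == orig.w8[i]);
}
#endif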
3660
3661 /* For description, see definition in guest_amd64_defs.h */
3662 void amd64g_dirtyhelper_AES (
3663 VexGuestAMD64State* gst,
3664 HWord opc4, HWord gstOffD,
3665 HWord gstOffL, HWord gstOffR
3666 )
3667 {
3668 // where the args are
3669 V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
3670 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3671 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3672 V128 r;
3673
3674 switch (opc4) {
3675 case 0xDC: /* AESENC */
3676 case 0xDD: /* AESENCLAST */
3677 r = *argR;
3678 ShiftRows (&r);
3679 SubBytes (&r);
3680 if (opc4 == 0xDC)
3681 MixColumns (&r);
3682 argD->w64[0] = r.w64[0] ^ argL->w64[0];
3683 argD->w64[1] = r.w64[1] ^ argL->w64[1];
3684 break;
3685
3686 case 0xDE: /* AESDEC */
3687 case 0xDF: /* AESDECLAST */
3688 r = *argR;
3689 InvShiftRows (&r);
3690 InvSubBytes (&r);
3691 if (opc4 == 0xDE)
3692 InvMixColumns (&r);
3693 argD->w64[0] = r.w64[0] ^ argL->w64[0];
3694 argD->w64[1] = r.w64[1] ^ argL->w64[1];
3695 break;
3696
3697 case 0xDB: /* AESIMC */
3698 *argD = *argL;
3699 InvMixColumns (argD);
3700 break;
3701 default: vassert(0);
3702 }
3703 }
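
/* Illustrative sketch (not part of the build): the helper operates on
   guest-state offsets rather than host pointers, so the front end can
   hand it the locations of the relevant XMM registers.  For AESENC
   (opc4 == 0xDC) the effect is
      dst = MixColumns(SubBytes(ShiftRows(state))) ^ roundkey
   where 'state' lives at gstOffR, 'roundkey' at gstOffL and the result
   is written at gstOffD.  A hypothetical direct call, using YMM
   register slots purely as an example: */
#if 0
static void example_aesenc_round ( VexGuestAMD64State* st )
{
   amd64g_dirtyhelper_AES(
      st, 0xDC,                                     /* AESENC           */
      offsetof(VexGuestAMD64State, guest_YMM0),     /* gstOffD: result  */
      offsetof(VexGuestAMD64State, guest_YMM1),     /* gstOffL: roundkey*/
      offsetof(VexGuestAMD64State, guest_YMM2) );   /* gstOffR: state   */
}
#endif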
3704
3705 static inline UInt RotWord (UInt w32)
3706 {
3707 return ((w32 >> 8) | (w32 << 24));
3708 }
3709
3710 static inline UInt SubWord (UInt w32)
3711 {
3712 UChar *w8;
3713 UChar *r8;
3714 UInt res;
3715 w8 = (UChar*) &w32;
3716 r8 = (UChar*) &res;
3717 r8[0] = sbox[w8[0]];
3718 r8[1] = sbox[w8[1]];
3719 r8[2] = sbox[w8[2]];
3720 r8[3] = sbox[w8[3]];
3721 return res;
3722 }
3723
3724 /* For description, see definition in guest_amd64_defs.h */
3725 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
3726 VexGuestAMD64State* gst,
3727 HWord imm8,
3728 HWord gstOffL, HWord gstOffR
3729 )
3730 {
3731 // where the args are
3732 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3733 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3734
3735 argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
3736 argR->w32[2] = SubWord (argL->w32[3]);
3737 argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
3738 argR->w32[0] = SubWord (argL->w32[1]);
3739 }
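
/* Illustrative sketch (not part of the build): in AES-128 key
   expansion the interesting output of AESKEYGENASSIST is its word 3,
   namely RotWord(SubWord(prev.w32[3])) ^ rcon, which gets folded into
   the previous round key to produce the next one.  A hypothetical
   expansion step built directly on RotWord/SubWord: */
#if 0
static void example_aes128_expand_step ( const V128* prev, UInt rcon,
                                         /*OUT*/V128* next )
{
   UInt t = RotWord(SubWord(prev->w32[3])) ^ rcon;
   next->w32[0] = prev->w32[0] ^ t;
   next->w32[1] = prev->w32[1] ^ next->w32[0];
   next->w32[2] = prev->w32[2] ^ next->w32[1];
   next->w32[3] = prev->w32[3] ^ next->w32[2];
}
#endif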
3740
3741
3742
3743 /*---------------------------------------------------------------*/
3744 /*--- Helpers for dealing with, and describing, ---*/
3745 /*--- guest state as a whole. ---*/
3746 /*---------------------------------------------------------------*/
3747
3748 /* Initialise the entire amd64 guest state. */
3749 /* VISIBLE TO LIBVEX CLIENT */
3750 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
3751 {
3752 vex_state->host_EvC_FAILADDR = 0;
3753 vex_state->host_EvC_COUNTER = 0;
3754 vex_state->pad0 = 0;
3755
3756 vex_state->guest_RAX = 0;
3757 vex_state->guest_RCX = 0;
3758 vex_state->guest_RDX = 0;
3759 vex_state->guest_RBX = 0;
3760 vex_state->guest_RSP = 0;
3761 vex_state->guest_RBP = 0;
3762 vex_state->guest_RSI = 0;
3763 vex_state->guest_RDI = 0;
3764 vex_state->guest_R8 = 0;
3765 vex_state->guest_R9 = 0;
3766 vex_state->guest_R10 = 0;
3767 vex_state->guest_R11 = 0;
3768 vex_state->guest_R12 = 0;
3769 vex_state->guest_R13 = 0;
3770 vex_state->guest_R14 = 0;
3771 vex_state->guest_R15 = 0;
3772
3773 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
3774 vex_state->guest_CC_DEP1 = 0;
3775 vex_state->guest_CC_DEP2 = 0;
3776 vex_state->guest_CC_NDEP = 0;
3777
3778 vex_state->guest_DFLAG = 1; /* forwards */
3779 vex_state->guest_IDFLAG = 0;
3780 vex_state->guest_ACFLAG = 0;
3781
3782 /* HACK: represent the offset associated with %fs==0. This
3783 assumes that %fs is only ever zero. */
3784 vex_state->guest_FS_ZERO = 0;
3785
3786 vex_state->guest_RIP = 0;
3787
3788 /* Initialise the simulated FPU */
3789 amd64g_dirtyhelper_FINIT( vex_state );
3790
3791 /* Initialise the AVX state. */
3792 # define AVXZERO(_ymm) \
3793 do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
3794 _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
3795 } while (0)
3796 vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
3797 AVXZERO(vex_state->guest_YMM0);
3798 AVXZERO(vex_state->guest_YMM1);
3799 AVXZERO(vex_state->guest_YMM2);
3800 AVXZERO(vex_state->guest_YMM3);
3801 AVXZERO(vex_state->guest_YMM4);
3802 AVXZERO(vex_state->guest_YMM5);
3803 AVXZERO(vex_state->guest_YMM6);
3804 AVXZERO(vex_state->guest_YMM7);
3805 AVXZERO(vex_state->guest_YMM8);
3806 AVXZERO(vex_state->guest_YMM9);
3807 AVXZERO(vex_state->guest_YMM10);
3808 AVXZERO(vex_state->guest_YMM11);
3809 AVXZERO(vex_state->guest_YMM12);
3810 AVXZERO(vex_state->guest_YMM13);
3811 AVXZERO(vex_state->guest_YMM14);
3812 AVXZERO(vex_state->guest_YMM15);
3813 AVXZERO(vex_state->guest_YMM16);
3814
3815 # undef AVXZERO
3816
3817 vex_state->guest_EMNOTE = EmNote_NONE;
3818
3819    /* These should never be read or written, but we
3820       initialise them anyway. */
3821 vex_state->guest_CMSTART = 0;
3822 vex_state->guest_CMLEN = 0;
3823
3824 vex_state->guest_NRADDR = 0;
3825 vex_state->guest_SC_CLASS = 0;
3826 vex_state->guest_GS_0x60 = 0;
3827
3828 vex_state->guest_IP_AT_SYSCALL = 0;
3829 vex_state->pad1 = 0;
3830 }
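
/* Illustrative sketch (not part of the build): a LibVEX client would
   typically call this once per guest thread and then set at least the
   stack pointer and entry point before running any translated code.
   The addresses below are purely hypothetical. */
#if 0
static void example_client_setup ( VexGuestAMD64State* st )
{
   LibVEX_GuestAMD64_initialise(st);
   st->guest_RSP = 0x7FFF00000000ULL;  /* hypothetical initial stack */
   st->guest_RIP = 0x400000ULL;        /* hypothetical entry point   */
}
#endif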
3831
3832
3833 /* Figure out if any part of the guest state contained in minoff
3834 .. maxoff requires precise memory exceptions. If in doubt return
3835 True (but this generates significantly slower code).
3836
3837 By default we enforce precise exns for guest %RSP, %RBP and %RIP
3838 only. These are the minimum needed to extract correct stack
3839 backtraces from amd64 code.
3840
3841 Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
3842 */
3843 Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
3844 Int maxoff)
3845 {
3846 Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
3847 Int rbp_max = rbp_min + 8 - 1;
3848 Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
3849 Int rsp_max = rsp_min + 8 - 1;
3850 Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
3851 Int rip_max = rip_min + 8 - 1;
3852
3853 if (maxoff < rsp_min || minoff > rsp_max) {
3854 /* no overlap with rsp */
3855 if (vex_control.iropt_register_updates == VexRegUpdSpAtMemAccess)
3856          return False; // We only need to check the stack pointer.
3857 } else {
3858 return True;
3859 }
3860
3861 if (maxoff < rbp_min || minoff > rbp_max) {
3862 /* no overlap with rbp */
3863 } else {
3864 return True;
3865 }
3866
3867 if (maxoff < rip_min || minoff > rip_max) {
3868       /* no overlap with rip */
3869 } else {
3870 return True;
3871 }
3872
3873 return False;
3874 }
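
/* Illustrative sketch (not part of the build): in the default
   register-update mode, only ranges that overlap %RSP, %RBP or %RIP
   need precise exceptions.  Hypothetical queries: */
#if 0
static void example_precise_exn_queries ( void )
{
   Int rax = offsetof(VexGuestAMD64State, guest_RAX);
   Int rsp = offsetof(VexGuestAMD64State, guest_RSP);
   /* A write covering only %RAX does not force precise exceptions. */
   vassert(!guest_amd64_state_requires_precise_mem_exns(rax, rax+7));
   /* A write covering %RSP does. */
   vassert( guest_amd64_state_requires_precise_mem_exns(rsp, rsp+7));
}
#endif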
3875
3876
3877 #define ALWAYSDEFD(field) \
3878 { offsetof(VexGuestAMD64State, field), \
3879 (sizeof ((VexGuestAMD64State*)0)->field) }
3880
3881 VexGuestLayout
3882 amd64guest_layout
3883 = {
3884 /* Total size of the guest state, in bytes. */
3885 .total_sizeB = sizeof(VexGuestAMD64State),
3886
3887 /* Describe the stack pointer. */
3888 .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
3889 .sizeof_SP = 8,
3890
3891 /* Describe the frame pointer. */
3892 .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
3893 .sizeof_FP = 8,
3894
3895 /* Describe the instruction pointer. */
3896 .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
3897 .sizeof_IP = 8,
3898
3899 /* Describe any sections to be regarded by Memcheck as
3900 'always-defined'. */
3901 .n_alwaysDefd = 16,
3902
3903 /* flags thunk: OP and NDEP are always defd, whereas DEP1
3904 and DEP2 have to be tracked. See detailed comment in
3905 gdefs.h on meaning of thunk fields. */
3906 .alwaysDefd
3907 = { /* 0 */ ALWAYSDEFD(guest_CC_OP),
3908 /* 1 */ ALWAYSDEFD(guest_CC_NDEP),
3909 /* 2 */ ALWAYSDEFD(guest_DFLAG),
3910 /* 3 */ ALWAYSDEFD(guest_IDFLAG),
3911 /* 4 */ ALWAYSDEFD(guest_RIP),
3912 /* 5 */ ALWAYSDEFD(guest_FS_ZERO),
3913 /* 6 */ ALWAYSDEFD(guest_FTOP),
3914 /* 7 */ ALWAYSDEFD(guest_FPTAG),
3915 /* 8 */ ALWAYSDEFD(guest_FPROUND),
3916 /* 9 */ ALWAYSDEFD(guest_FC3210),
3917 // /* */ ALWAYSDEFD(guest_CS),
3918 // /* */ ALWAYSDEFD(guest_DS),
3919 // /* */ ALWAYSDEFD(guest_ES),
3920 // /* */ ALWAYSDEFD(guest_FS),
3921 // /* */ ALWAYSDEFD(guest_GS),
3922 // /* */ ALWAYSDEFD(guest_SS),
3923 // /* */ ALWAYSDEFD(guest_LDT),
3924 // /* */ ALWAYSDEFD(guest_GDT),
3925 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
3926 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
3927 /* 12 */ ALWAYSDEFD(guest_CMSTART),
3928 /* 13 */ ALWAYSDEFD(guest_CMLEN),
3929 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
3930 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
3931 }
3932 };
3933
3934
3935 /*---------------------------------------------------------------*/
3936 /*--- end guest_amd64_helpers.c ---*/
3937 /*---------------------------------------------------------------*/
3938