• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 /*---------------------------------------------------------------*/
3 /*--- begin                             guest_amd64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2004-2012 OpenWorks LLP
11       info@open-works.net
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26    02110-1301, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 
30    Neither the names of the U.S. Department of Energy nor the
31    University of California nor the names of its contributors may be
32    used to endorse or promote products derived from this software
33    without prior written permission.
34 */
35 
36 #include "libvex_basictypes.h"
37 #include "libvex_emwarn.h"
38 #include "libvex_guest_amd64.h"
39 #include "libvex_ir.h"
40 #include "libvex.h"
41 
42 #include "main_util.h"
43 #include "guest_generic_bb_to_IR.h"
44 #include "guest_amd64_defs.h"
45 #include "guest_generic_x87.h"
46 
47 
48 /* This file contains helper functions for amd64 guest code.
49    Calls to these functions are generated by the back end.
50    These calls are of course in the host machine code and
51    this file will be compiled to host machine code, so that
52    all makes sense.
53 
54    Only change the signatures of these helper functions very
55    carefully.  If you change the signature here, you'll have to change
56    the parameters passed to it in the IR calls constructed by
57    guest-amd64/toIR.c.
58 
59    The convention used is that all functions called from generated
60    code are named amd64g_<something>, and any function whose name lacks
61    that prefix is not called from generated code.  Note that some
62    LibVEX_* functions can however be called by VEX's client, but that
63    is not the same as calling them from VEX-generated code.
64 */
65 
66 
67 /* Set to 1 to get detailed profiling info about use of the flag
68    machinery. */
69 #define PROFILE_RFLAGS 0
70 
71 
72 /*---------------------------------------------------------------*/
73 /*--- %rflags run-time helpers.                               ---*/
74 /*---------------------------------------------------------------*/
75 
76 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
77    after imulq/mulq. */
78 
/* Signed 64x64 -> 128 multiply.  The high 64 bits of u * v are
   written to *rHi and the low 64 bits to *rLo.  The high half is
   assembled from 32-bit partial products. */
static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
{
   ULong u0, v0, w0;
    Long u1, v1, w1, w2, t;
   u0   = u & 0xFFFFFFFFULL;
   u1   = u >> 32;
   v0   = v & 0xFFFFFFFFULL;
   v1   = v >> 32;
   w0   = u0 * v0;
   t    = u1 * v0 + (w0 >> 32);
   w1   = t & 0xFFFFFFFFULL;
   w2   = t >> 32;
   w1   = u0 * v1 + w1;
   *rHi = u1 * v1 + w2 + (w1 >> 32);
   /* Form the low half with unsigned arithmetic.  A signed u * v
      overflows (undefined behaviour) whenever the true product does
      not fit in 64 bits; the unsigned product wraps instead and has
      the same low 64 bits. */
   *rLo = (Long)((ULong)u * (ULong)v);
}
95 
/* Unsigned 64x64 -> 128 multiply.  The high 64 bits of u * v are
   written to *rHi and the low 64 bits to *rLo.  Each operand is
   split into 32-bit halves and the partial products are summed. */
static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
{
   const ULong low32 = 0xFFFFFFFFULL;
   const ULong uLo   = u & low32;
   const ULong uHi   = u >> 32;
   const ULong vLo   = v & low32;
   const ULong vHi   = v >> 32;
   /* lo x lo, then fold its upper half into the first cross term */
   const ULong loLo  = uLo * vLo;
   const ULong cross = uHi * vLo + (loLo >> 32);
   /* second cross term plus the low half of the first one */
   const ULong mid   = uLo * vHi + (cross & low32);

   *rHi = uHi * vHi + (cross >> 32) + (mid >> 32);
   *rLo = u * v;
}
112 
113 
114 static const UChar parity_table[256] = {
115     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
116     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
117     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
118     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
119     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
120     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
121     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
123     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
124     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
125     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
126     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
127     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
128     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
129     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
131     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
132     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
133     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
135     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
136     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
137     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
139     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
140     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
141     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
142     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
143     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
144     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
145     AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146     0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
147 };
148 
149 /* generalised left-shifter */
lshift(Long x,Int n)150 static inline Long lshift ( Long x, Int n )
151 {
152    if (n >= 0)
153       return x << n;
154    else
155       return x >> (-n);
156 }
157 
158 /* identity on ULong */
idULong(ULong x)159 static inline ULong idULong ( ULong x )
160 {
161    return x;
162 }
163 
164 
/* PREAMBLE(width): common prologue of the ACTIONS_* macros.  It
   expects cc_dep1_formal, cc_dep2_formal and cc_ndep_formal to be in
   scope, and declares:
      CC_DEP1, CC_DEP2, CC_NDEP   copies of the three formals
      DATA_MASK                   all-ones in the low 'width' bits
                                  (any width other than 8/16/32 gets
                                  the full 64-bit mask)
      SIGN_MASK                   just bit 'width'-1
   Not every ACTIONS_* macro uses all of them, so each possibly-unused
   one is referenced once in a no-op statement to keep the compiler
   from complaining about unused variables. */
#define PREAMBLE(width)                                         \
   /* const */ ULong CC_DEP1 = cc_dep1_formal;                  \
   /* const */ ULong CC_DEP2 = cc_dep2_formal;                  \
   /* const */ ULong CC_NDEP = cc_ndep_formal;                  \
   /* const */ ULong DATA_MASK                                  \
      = (width) == 8  ? 0xFFULL                                 \
      : (width) == 16 ? 0xFFFFULL                               \
      : (width) == 32 ? 0xFFFFFFFFULL                           \
      :                 0xFFFFFFFFFFFFFFFFULL;                  \
   /* const */ ULong SIGN_MASK = 1ULL << ((width) - 1);         \
   (void)SIGN_MASK;                                             \
   (void)DATA_MASK;                                             \
   (void)CC_DEP2;                                               \
   (void)CC_NDEP;
185 
186 
187 /*-------------------------------------------------------------*/
188 
/* ADD: flags for res = CC_DEP1 + CC_DEP2 at a width of DATA_BITS.
   DATA_UTYPE is the unsigned type of that width.  The arithmetic is
   done on ULong so that wrap-around is well defined; with signed
   Long operands the addition could overflow, which is undefined
   behaviour. */
#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL + argR;                                        \
     /* carry out iff the truncated sum wrapped below argL */   \
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                   \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     /* overflow iff operands agree in sign and res differs */  \
     of = lshift(~(argL ^ argR) & (argL ^ res),                 \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
207 
208 /*-------------------------------------------------------------*/
209 
/* SUB: flags for res = CC_DEP1 - CC_DEP2 at a width of DATA_BITS.
   DATA_UTYPE is the unsigned type of that width.  The arithmetic is
   done on ULong so that wrap-around is well defined; with signed
   Long operands the subtraction could overflow, which is undefined
   behaviour. */
#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2;                                            \
     res  = argL - argR;                                        \
     /* borrow iff the left operand is (unsigned) smaller */    \
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;                  \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     /* overflow iff operands differ in sign and res's sign */  \
     /* differs from argL's                                 */  \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
228 
229 /*-------------------------------------------------------------*/
230 
/* ADC: flags for res = argL + argR + oldC at a width of DATA_BITS.
   oldC is the carry bit of CC_NDEP, argL is CC_DEP1 and argR is
   recovered as CC_DEP2 ^ oldC.  DATA_UTYPE is the unsigned type of
   that width.  The arithmetic is done on ULong so that wrap-around
   is well defined; with signed Long operands the additions could
   overflow, which is undefined behaviour. */
#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL + argR) + oldC;                               \
     /* with a carry-in, res == argL also means a carry out */  \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;               \
     else                                                       \
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;                \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift(~(argL ^ argR) & (argL ^ res),                 \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
253 
254 /*-------------------------------------------------------------*/
255 
/* SBB: flags for res = argL - argR - oldC at a width of DATA_BITS.
   oldC is the carry bit of CC_NDEP, argL is CC_DEP1 and argR is
   recovered as CC_DEP2 ^ oldC.  DATA_UTYPE is the unsigned type of
   that width.  The arithmetic is done on ULong so that wrap-around
   is well defined; with signed Long operands the subtractions could
   overflow, which is undefined behaviour. */
#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, oldC, res;                               \
     oldC = CC_NDEP & AMD64G_CC_MASK_C;                         \
     argL = CC_DEP1;                                            \
     argR = CC_DEP2 ^ oldC;                                     \
     res  = (argL - argR) - oldC;                               \
     /* with a borrow-in, equal operands also borrow */         \
     if (oldC)                                                  \
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;              \
     else                                                       \
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;               \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     of = lshift((argL ^ argR) & (argL ^ res),                  \
                 12 - DATA_BITS) & AMD64G_CC_MASK_O;            \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
278 
279 /*-------------------------------------------------------------*/
280 
/* LOGIC: flags for a logical operation whose result is CC_DEP1.
   The carry, auxiliary-carry and overflow contributions are all
   zero; only parity, zero and sign are derived from the result.
   DATA_UTYPE is the unsigned type that is DATA_BITS wide. */
#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)                     \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long flags;                                                \
     /* parity of the low byte */                               \
     flags  = parity_table[(UChar)CC_DEP1];                     \
     /* zero flag, bit 6 */                                     \
     flags |= ((DATA_UTYPE)CC_DEP1 == 0) << 6;                  \
     /* sign flag: top bit of the result moved to bit 7 */      \
     flags |= lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;            \
     return flags;                                              \
   }                                                            \
}
294 
295 /*-------------------------------------------------------------*/
296 
/* INC: flags for an increment whose result is CC_DEP1.  The carry
   flag is not affected and is taken from CC_NDEP.  The operand is
   reconstructed as res - 1; this is done on ULong so that the
   subtraction wraps rather than overflowing a signed Long, which
   would be undefined behaviour.  DATA_UTYPE is the unsigned type
   that is DATA_BITS wide. */
#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res - 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     /* overflow iff the result is exactly the sign bit */      \
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;               \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
314 
315 /*-------------------------------------------------------------*/
316 
/* DEC: flags for a decrement whose result is CC_DEP1.  The carry
   flag is not affected and is taken from CC_NDEP.  The operand is
   reconstructed as res + 1; this is done on ULong so that the
   addition wraps rather than overflowing a signed Long, which would
   be undefined behaviour.  DATA_UTYPE is the unsigned type that is
   DATA_BITS wide. */
#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong cf, pf, af, zf, sf, of;                              \
     ULong argL, argR, res;                                     \
     res  = CC_DEP1;                                            \
     argL = res + 1;                                            \
     argR = 1;                                                  \
     cf = CC_NDEP & AMD64G_CC_MASK_C;                           \
     pf = parity_table[(UChar)res];                             \
     af = (res ^ argL ^ argR) & 0x10;                           \
     zf = ((DATA_UTYPE)res == 0) << 6;                          \
     sf = lshift(res, 8 - DATA_BITS) & 0x80;                    \
     /* overflow iff the result is sign bit minus one, i.e. */  \
     /* the largest positive value at this width           */   \
     of = ((res & DATA_MASK)                                    \
          == ((ULong)SIGN_MASK - 1)) << 11;                     \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
335 
336 /*-------------------------------------------------------------*/
337 
/* SHL: flags for a left shift.  CC_DEP1 is used as the result;
   the carry is bit DATA_BITS-1 of CC_DEP2.  The auxiliary-carry
   flag is architecturally undefined and is left as zero.
   DATA_UTYPE is the unsigned type that is DATA_BITS wide. */
#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long flags;                                                \
     /* carry */                                                \
     flags  = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C;  \
     /* parity of the low byte of the result */                 \
     flags |= parity_table[(UChar)CC_DEP1];                     \
     /* zero flag, bit 6 */                                     \
     flags |= ((DATA_UTYPE)CC_DEP1 == 0) << 6;                  \
     /* sign flag, bit 7 */                                     \
     flags |= lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;            \
     /* overflow; only defined for a shift count of 1 */        \
     flags |= lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)         \
              & AMD64G_CC_MASK_O;                               \
     return flags;                                              \
   }                                                            \
}
353 
354 /*-------------------------------------------------------------*/
355 
/* SHR: flags for a right shift.  CC_DEP1 is used as the result;
   the carry is bit 0 of CC_DEP2.  The auxiliary-carry flag is
   architecturally undefined and is left as zero.  DATA_UTYPE is the
   unsigned type that is DATA_BITS wide. */
#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long flags;                                                \
     /* carry */                                                \
     flags  = CC_DEP2 & 1;                                      \
     /* parity of the low byte of the result */                 \
     flags |= parity_table[(UChar)CC_DEP1];                     \
     /* zero flag, bit 6 */                                     \
     flags |= ((DATA_UTYPE)CC_DEP1 == 0) << 6;                  \
     /* sign flag, bit 7 */                                     \
     flags |= lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;            \
     /* overflow; only defined for a shift count of 1 */        \
     flags |= lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)         \
              & AMD64G_CC_MASK_O;                               \
     return flags;                                              \
   }                                                            \
}
371 
372 /*-------------------------------------------------------------*/
373 
/* ROL: only C and O change; every other bit is copied from the old
   flags in CC_NDEP.  CC_DEP1 is the rotated result.
      C' = lsb(result)
      O' = msb(result) ^ lsb(result)
   Both the msb and the lsb are moved to the O bit position before
   being XORed and masked. */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong kept = CC_NDEP                                       \
                  & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C);     \
     ULong newC = AMD64G_CC_MASK_C & CC_DEP1;                   \
     ULong newO = AMD64G_CC_MASK_O                              \
                  & (lshift(CC_DEP1, 11-(DATA_BITS-1))          \
                     ^ lshift(CC_DEP1, 11));                    \
     return kept | newC | newO;                                 \
   }                                                            \
}
388 
389 /*-------------------------------------------------------------*/
390 
/* ROR: only C and O change; every other bit is copied from the old
   flags in CC_NDEP.  CC_DEP1 is the rotated result.
      C' = msb(result)
      O' = msb(result) ^ msb-1(result)
   Both the msb and the bit below it are moved to the O bit position
   before being XORed and masked. */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)                       \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { ULong kept = CC_NDEP                                       \
                  & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C);     \
     ULong newC = AMD64G_CC_MASK_C                              \
                  & (CC_DEP1 >> (DATA_BITS-1));                 \
     ULong newO = AMD64G_CC_MASK_O                              \
                  & (lshift(CC_DEP1, 11-(DATA_BITS-1))          \
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1));    \
     return kept | newC | newO;                                 \
   }                                                            \
}
405 
406 /*-------------------------------------------------------------*/
407 
/* UMUL: flags for an unsigned widening multiply of CC_DEP1 by
   CC_DEP2 at a width of DATA_BITS.  DATA_UTYPE is the unsigned type
   of that width and DATA_U2TYPE the unsigned type twice as wide;
   NARROWtoU and NARROWto2U narrow a value to those types.

   'lo' is taken from the double-width product rather than from a
   separate DATA_UTYPE x DATA_UTYPE multiply.  When DATA_UTYPE is
   narrower than int both operands of such a multiply are promoted
   to (signed) int, and for 16-bit operands the product can exceed
   INT_MAX, which is undefined behaviour.  C and O are set iff the
   high half of the product is non-zero. */
#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo;                                            \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     lo = (DATA_UTYPE)rr;                                       \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
431 
432 /*-------------------------------------------------------------*/
433 
/* SMUL: flags for a signed widening multiply of CC_DEP1 by CC_DEP2
   at a width of DATA_BITS.  DATA_STYPE is the signed type of that
   width and DATA_S2TYPE the signed type twice as wide; NARROWtoS and
   NARROWto2S narrow a value to the corresponding widths.

   'lo' is taken from the double-width product rather than from a
   separate DATA_STYPE x DATA_STYPE multiply: for 32-bit operands
   that multiply is done in int and can overflow, which is undefined
   behaviour.  The double-width product always fits in DATA_S2TYPE.
   C and O are set iff the high half is not the sign-extension of
   the low half. */
#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { Long cf, pf, af, zf, sf, of;                               \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo;                                            \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     lo = NARROWtoS(rr);                                        \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }                                                            \
}
457 
458 /*-------------------------------------------------------------*/
459 
/* UMULQ: flags for an unsigned 64x64 multiply of CC_DEP1 by CC_DEP2.
   The 128-bit product comes from mullU64.  C and O are set iff the
   high 64 bits are non-zero; P, Z and S are derived from the low 64
   bits; A is architecturally undefined and is left as zero. */
#define ACTIONS_UMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { ULong prodHi, prodLo;                                      \
     Long  carry, flags;                                        \
     mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2,                   \
              &prodHi, &prodLo );                               \
     carry  = (prodHi != 0);                                    \
     /* C at bit 0, O at bit 11 */                              \
     flags  = carry | (carry << 11);                            \
     flags |= parity_table[(UChar)prodLo];                      \
     flags |= (prodLo == 0) << 6;                               \
     flags |= lshift(prodLo, 8 - 64) & 0x80;                    \
     return flags;                                              \
   }                                                            \
}
475 
476 /*-------------------------------------------------------------*/
477 
/* SMULQ: flags for a signed 64x64 multiply of CC_DEP1 by CC_DEP2.
   The 128-bit product comes from mullS64.  C and O are set iff the
   high 64 bits are not the sign-extension of the low 64 bits; P, Z
   and S are derived from the low 64 bits; A is architecturally
   undefined and is left as zero. */
#define ACTIONS_SMULQ                                           \
{                                                               \
   PREAMBLE(64);                                                \
   { Long prodHi, prodLo;                                       \
     Long carry, flags;                                         \
     mullS64( (Long)CC_DEP1, (Long)CC_DEP2,                     \
              &prodHi, &prodLo );                               \
     carry  = (prodHi != (prodLo >>/*s*/ (64-1)));              \
     /* C at bit 0, O at bit 11 */                              \
     flags  = carry | (carry << 11);                            \
     flags |= parity_table[(UChar)prodLo];                      \
     flags |= (prodLo == 0) << 6;                               \
     flags |= lshift(prodLo, 8 - 64) & 0x80;                    \
     return flags;                                              \
   }                                                            \
}
493 
494 
495 #if PROFILE_RFLAGS
496 
497 static Bool initted     = False;
498 
499 /* C flag, fast route */
500 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
501 /* C flag, slow route */
502 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
503 /* table for calculate_cond */
504 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
505 /* total entry counts for calc_all, calc_c, calc_cond. */
506 static UInt n_calc_all  = 0;
507 static UInt n_calc_c    = 0;
508 static UInt n_calc_cond = 0;
509 
510 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
511 
512 
showCounts(void)513 static void showCounts ( void )
514 {
515    Int op, co;
516    Char ch;
517    vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
518               n_calc_all, n_calc_cond, n_calc_c);
519 
520    vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
521               "    S   NS    P   NP    L   NL   LE  NLE\n");
522    vex_printf("     -----------------------------------------------------"
523               "----------------------------------------\n");
524    for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
525 
526       ch = ' ';
527       if (op > 0 && (op-1) % 4 == 0)
528          ch = 'B';
529       if (op > 0 && (op-1) % 4 == 1)
530          ch = 'W';
531       if (op > 0 && (op-1) % 4 == 2)
532          ch = 'L';
533       if (op > 0 && (op-1) % 4 == 3)
534          ch = 'Q';
535 
536       vex_printf("%2d%c: ", op, ch);
537       vex_printf("%6u ", tabc_slow[op]);
538       vex_printf("%6u ", tabc_fast[op]);
539       for (co = 0; co < 16; co++) {
540          Int n = tab_cond[op][co];
541          if (n >= 1000) {
542             vex_printf(" %3dK", n / 1000);
543          } else
544          if (n >= 0) {
545             vex_printf(" %3d ", n );
546          } else {
547             vex_printf("     ");
548          }
549       }
550       vex_printf("\n");
551    }
552    vex_printf("\n");
553 }
554 
initCounts(void)555 static void initCounts ( void )
556 {
557    Int op, co;
558    initted = True;
559    for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
560       tabc_fast[op] = tabc_slow[op] = 0;
561       for (co = 0; co < 16; co++)
562          tab_cond[op][co] = 0;
563    }
564 }
565 
566 #endif /* PROFILE_RFLAGS */
567 
568 
569 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
570 /* Calculate all the 6 flags from the supplied thunk parameters.
571    Worker function, not directly called from generated code. */
572 static
amd64g_calculate_rflags_all_WRK(ULong cc_op,ULong cc_dep1_formal,ULong cc_dep2_formal,ULong cc_ndep_formal)573 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
574                                         ULong cc_dep1_formal,
575                                         ULong cc_dep2_formal,
576                                         ULong cc_ndep_formal )
577 {
578    switch (cc_op) {
579       case AMD64G_CC_OP_COPY:
580          return cc_dep1_formal
581                 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
582                    | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
583 
584       case AMD64G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
585       case AMD64G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
586       case AMD64G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );
587       case AMD64G_CC_OP_ADDQ:   ACTIONS_ADD( 64, ULong  );
588 
589       case AMD64G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
590       case AMD64G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
591       case AMD64G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );
592       case AMD64G_CC_OP_ADCQ:   ACTIONS_ADC( 64, ULong  );
593 
594       case AMD64G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
595       case AMD64G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
596       case AMD64G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );
597       case AMD64G_CC_OP_SUBQ:   ACTIONS_SUB( 64, ULong  );
598 
599       case AMD64G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
600       case AMD64G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
601       case AMD64G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );
602       case AMD64G_CC_OP_SBBQ:   ACTIONS_SBB( 64, ULong  );
603 
604       case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
605       case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
606       case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );
607       case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong  );
608 
609       case AMD64G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
610       case AMD64G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
611       case AMD64G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );
612       case AMD64G_CC_OP_INCQ:   ACTIONS_INC( 64, ULong  );
613 
614       case AMD64G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
615       case AMD64G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
616       case AMD64G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );
617       case AMD64G_CC_OP_DECQ:   ACTIONS_DEC( 64, ULong  );
618 
619       case AMD64G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
620       case AMD64G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
621       case AMD64G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );
622       case AMD64G_CC_OP_SHLQ:   ACTIONS_SHL( 64, ULong  );
623 
624       case AMD64G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
625       case AMD64G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
626       case AMD64G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );
627       case AMD64G_CC_OP_SHRQ:   ACTIONS_SHR( 64, ULong  );
628 
629       case AMD64G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
630       case AMD64G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
631       case AMD64G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );
632       case AMD64G_CC_OP_ROLQ:   ACTIONS_ROL( 64, ULong  );
633 
634       case AMD64G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
635       case AMD64G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
636       case AMD64G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );
637       case AMD64G_CC_OP_RORQ:   ACTIONS_ROR( 64, ULong  );
638 
639       case AMD64G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
640                                                   UShort, toUShort );
641       case AMD64G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
642                                                   UInt,   toUInt );
643       case AMD64G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
644                                                   ULong,  idULong );
645 
646       case AMD64G_CC_OP_UMULQ:  ACTIONS_UMULQ;
647 
648       case AMD64G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
649                                                   Short,  toUShort );
650       case AMD64G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort,
651                                                   Int,    toUInt   );
652       case AMD64G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
653                                                   Long,   idULong );
654 
655       case AMD64G_CC_OP_SMULQ:  ACTIONS_SMULQ;
656 
657       default:
658          /* shouldn't really make these calls from generated code */
659          vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
660                     "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
661                     cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
662          vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
663    }
664 }
665 
666 
667 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
668 /* Calculate all the 6 flags from the supplied thunk parameters. */
amd64g_calculate_rflags_all(ULong cc_op,ULong cc_dep1,ULong cc_dep2,ULong cc_ndep)669 ULong amd64g_calculate_rflags_all ( ULong cc_op,
670                                     ULong cc_dep1,
671                                     ULong cc_dep2,
672                                     ULong cc_ndep )
673 {
674 #  if PROFILE_RFLAGS
675    if (!initted) initCounts();
676    n_calc_all++;
677    if (SHOW_COUNTS_NOW) showCounts();
678 #  endif
679    return
680       amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
681 }
682 
683 
684 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
685 /* Calculate just the carry flag from the supplied thunk parameters. */
amd64g_calculate_rflags_c(ULong cc_op,ULong cc_dep1,ULong cc_dep2,ULong cc_ndep)686 ULong amd64g_calculate_rflags_c ( ULong cc_op,
687                                   ULong cc_dep1,
688                                   ULong cc_dep2,
689                                   ULong cc_ndep )
690 {
691 #  if PROFILE_RFLAGS
692    if (!initted) initCounts();
693    n_calc_c++;
694    tabc_fast[cc_op]++;
695    if (SHOW_COUNTS_NOW) showCounts();
696 #  endif
697 
698    /* Fast-case some common ones. */
699    switch (cc_op) {
700       case AMD64G_CC_OP_COPY:
701          return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
702       case AMD64G_CC_OP_LOGICQ:
703       case AMD64G_CC_OP_LOGICL:
704       case AMD64G_CC_OP_LOGICW:
705       case AMD64G_CC_OP_LOGICB:
706          return 0;
707 	 //      case AMD64G_CC_OP_SUBL:
708 	 //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
709 	 //                   ? AMD64G_CC_MASK_C : 0;
710 	 //      case AMD64G_CC_OP_SUBW:
711 	 //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
712 	 //                   ? AMD64G_CC_MASK_C : 0;
713 	 //      case AMD64G_CC_OP_SUBB:
714 	 //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
715 	 //                   ? AMD64G_CC_MASK_C : 0;
716 	 //      case AMD64G_CC_OP_INCL:
717 	 //      case AMD64G_CC_OP_DECL:
718 	 //         return cc_ndep & AMD64G_CC_MASK_C;
719       default:
720          break;
721    }
722 
723 #  if PROFILE_RFLAGS
724    tabc_fast[cc_op]--;
725    tabc_slow[cc_op]++;
726 #  endif
727 
728    return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
729           & AMD64G_CC_MASK_C;
730 }
731 
732 
733 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
734 /* returns 1 or 0 */
amd64g_calculate_condition(ULong cond,ULong cc_op,ULong cc_dep1,ULong cc_dep2,ULong cc_ndep)735 ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
736                                    ULong cc_op,
737                                    ULong cc_dep1,
738                                    ULong cc_dep2,
739                                    ULong cc_ndep )
740 {
741    ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
742                                                   cc_dep2, cc_ndep);
743    ULong of,sf,zf,cf,pf;
744    ULong inv = cond & 1;
745 
746 #  if PROFILE_RFLAGS
747    if (!initted) initCounts();
748    tab_cond[cc_op][cond]++;
749    n_calc_cond++;
750    if (SHOW_COUNTS_NOW) showCounts();
751 #  endif
752 
753    switch (cond) {
754       case AMD64CondNO:
755       case AMD64CondO: /* OF == 1 */
756          of = rflags >> AMD64G_CC_SHIFT_O;
757          return 1 & (inv ^ of);
758 
759       case AMD64CondNZ:
760       case AMD64CondZ: /* ZF == 1 */
761          zf = rflags >> AMD64G_CC_SHIFT_Z;
762          return 1 & (inv ^ zf);
763 
764       case AMD64CondNB:
765       case AMD64CondB: /* CF == 1 */
766          cf = rflags >> AMD64G_CC_SHIFT_C;
767          return 1 & (inv ^ cf);
768          break;
769 
770       case AMD64CondNBE:
771       case AMD64CondBE: /* (CF or ZF) == 1 */
772          cf = rflags >> AMD64G_CC_SHIFT_C;
773          zf = rflags >> AMD64G_CC_SHIFT_Z;
774          return 1 & (inv ^ (cf | zf));
775          break;
776 
777       case AMD64CondNS:
778       case AMD64CondS: /* SF == 1 */
779          sf = rflags >> AMD64G_CC_SHIFT_S;
780          return 1 & (inv ^ sf);
781 
782       case AMD64CondNP:
783       case AMD64CondP: /* PF == 1 */
784          pf = rflags >> AMD64G_CC_SHIFT_P;
785          return 1 & (inv ^ pf);
786 
787       case AMD64CondNL:
788       case AMD64CondL: /* (SF xor OF) == 1 */
789          sf = rflags >> AMD64G_CC_SHIFT_S;
790          of = rflags >> AMD64G_CC_SHIFT_O;
791          return 1 & (inv ^ (sf ^ of));
792          break;
793 
794       case AMD64CondNLE:
795       case AMD64CondLE: /* ((SF xor OF) or ZF)  == 1 */
796          sf = rflags >> AMD64G_CC_SHIFT_S;
797          of = rflags >> AMD64G_CC_SHIFT_O;
798          zf = rflags >> AMD64G_CC_SHIFT_Z;
799          return 1 & (inv ^ ((sf ^ of) | zf));
800          break;
801 
802       default:
803          /* shouldn't really make these calls from generated code */
804          vex_printf("amd64g_calculate_condition"
805                     "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
806                     cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
807          vpanic("amd64g_calculate_condition");
808    }
809 }
810 
811 
812 /* VISIBLE TO LIBVEX CLIENT */
LibVEX_GuestAMD64_get_rflags(VexGuestAMD64State * vex_state)813 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/VexGuestAMD64State* vex_state )
814 {
815    ULong rflags = amd64g_calculate_rflags_all_WRK(
816                      vex_state->guest_CC_OP,
817                      vex_state->guest_CC_DEP1,
818                      vex_state->guest_CC_DEP2,
819                      vex_state->guest_CC_NDEP
820                   );
821    Long dflag = vex_state->guest_DFLAG;
822    vassert(dflag == 1 || dflag == -1);
823    if (dflag == -1)
824       rflags |= (1<<10);
825    if (vex_state->guest_IDFLAG == 1)
826       rflags |= (1<<21);
827    if (vex_state->guest_ACFLAG == 1)
828       rflags |= (1<<18);
829 
830    return rflags;
831 }
832 
833 /* VISIBLE TO LIBVEX CLIENT */
834 void
LibVEX_GuestAMD64_put_rflag_c(ULong new_carry_flag,VexGuestAMD64State * vex_state)835 LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
836                                /*MOD*/VexGuestAMD64State* vex_state )
837 {
838    ULong oszacp = amd64g_calculate_rflags_all_WRK(
839                      vex_state->guest_CC_OP,
840                      vex_state->guest_CC_DEP1,
841                      vex_state->guest_CC_DEP2,
842                      vex_state->guest_CC_NDEP
843                   );
844    if (new_carry_flag & 1) {
845       oszacp |= AMD64G_CC_MASK_C;
846    } else {
847       oszacp &= ~AMD64G_CC_MASK_C;
848    }
849    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
850    vex_state->guest_CC_DEP1 = oszacp;
851    vex_state->guest_CC_DEP2 = 0;
852    vex_state->guest_CC_NDEP = 0;
853 }
854 
855 
/*---------------------------------------------------------------*/
/*--- %rflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls to the above run-time ---*/
/*--- %rflags functions.                                      ---*/
/*---------------------------------------------------------------*/
861 
862 /* Used by the optimiser to try specialisations.  Returns an
863    equivalent expression, or NULL if none. */
864 
isU64(IRExpr * e,ULong n)865 static Bool isU64 ( IRExpr* e, ULong n )
866 {
867    return toBool( e->tag == Iex_Const
868                   && e->Iex.Const.con->tag == Ico_U64
869                   && e->Iex.Const.con->Ico.U64 == n );
870 }
871 
/* Try to replace a call to one of the run-time %rflags helpers with
   an equivalent in-line IR expression.

   function_name    name of the helper being called; only
                    "amd64g_calculate_condition" and
                    "amd64g_calculate_rflags_c" are handled.
   args             NULL-terminated vector of the call's arguments.
   precedingStmts   not used by this function.
   n_precedingStmts not used by this function.

   A specialisation applies only when the relevant leading arguments
   (cc_op, and for the condition helper also cond) are U64 constants
   matching one of the cases below.  Returns the replacement
   expression, or NULL if no case matches.  Every replacement
   produces a 64-bit value, as the helpers themselves do. */
IRExpr* guest_amd64_spechelper ( HChar* function_name,
                                 IRExpr** args,
                                 IRStmt** precedingStmts,
                                 Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   /* Count the arguments; the vector is NULL-terminated. */
   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "amd64g_calculate_condition" --------- */

   if (vex_streq(function_name, "amd64g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
         /* long long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Add64, cc_dep1, cc_dep2),
                           mkU64(0)));
      }

      /*---------------- SUBQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
         /* long long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
         /* long long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,cc_dep1,cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
         /* long long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
         /* long long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
         /* long long sub/cmp, then NB (unsigned greater than or equal)
            --> test src <=u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
         /* long long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
         /* long long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor64,
                      unop(Iop_1Uto64,
                           binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
                      mkU64(1));
      }

      /*---------------- SUBL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
         /* long sub/cmp, then L (signed less than)
            --> test dst <s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
         /* long sub/cmp, then NLE (signed greater than)
            --> test !(dst <=s src)
            --> test (dst >s src)
            --> test (src <s dst) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep2),
                           unop(Iop_64to32, cc_dep1)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
         /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32S,
                           binop(Iop_Sub32,
                                 unop(Iop_64to32, cc_dep1),
                                 unop(Iop_64to32, cc_dep2)),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }

      /*---------------- SUBW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
         /* word sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
         /* word sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE16,
                           unop(Iop_64to16,cc_dep1),
                           unop(Iop_64to16,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
         /* word sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src.  Shifting both operands left by 48
            puts the 16-bit values at the top of the word, so a
            64-bit signed compare orders them as 16-bit signed
            values. */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64S,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           binop(Iop_Shl64,cc_dep2,mkU8(48))));
      }

      /*---------------- SUBB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
         /* byte sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
         /* byte sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE8,
                           unop(Iop_64to8,cc_dep1),
                           unop(Iop_64to8,cc_dep2)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
         /* byte sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE64U,
                           binop(Iop_And64, cc_dep1, mkU64(0xFF)),
                           binop(Iop_And64, cc_dep2, mkU64(0xFF))));
      }

      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (ULong)dst[7]
            This is yet another scheme by which gcc figures out if the
            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU64(cc_dep2, 0) is correct, even though this is
            for an 8-bit comparison, since the args to the helper
            function are always U64s. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
                                          && isU64(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (ULong) !dst[7]
         */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- LOGICQ ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
         /* long long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
         /* long long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
         /* long long and/or/xor, then L
            LOGIC sets SF and ZF according to the
            result and makes OF be zero.  L computes SF ^ OF, but
            OF is zero, so this reduces to SF -- which will be 1 iff
            the result is < signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64S,
                           cc_dep1,
                           mkU64(0)));
      }

      /*---------------- LOGICL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
         /* long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
         /* long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
         /* long and/or/xor, then LE
            This is pretty subtle.  LOGIC sets SF and ZF according to the
            result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
            the result is <=signed 0.  Hence ...
         */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLE32S,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
         /* long and/or/xor, then S --> (ULong)result[31] */
         return binop(Iop_And64,
                      binop(Iop_Shr64, cc_dep1, mkU8(31)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
         /* long and/or/xor, then NS --> (ULong) ~ result[31] */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64, cc_dep1, mkU8(31)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- LOGICW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
         /* word and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
         /* word and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_And64, cc_dep1, mkU64(0xFFFF)),
                           mkU64(0)));
      }

      /*---------------- LOGICB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
         /* byte and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
         /* byte and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
                                        mkU64(0)));
      }

      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
         /* this is an idiom gcc sometimes uses to find out if the top
            bit of a byte register is set: eg testb %al,%al; js ..
            Since it just depends on the top bit of the byte, extract
            that bit and explicitly get rid of all the rest.  This
            helps memcheck avoid false positives in the case where any
            of the other bits in the byte are undefined. */
         /* byte and/or/xor, then S --> (ULong)result[7] */
         return binop(Iop_And64,
                      binop(Iop_Shr64,cc_dep1,mkU8(7)),
                      mkU64(1));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
         /* byte and/or/xor, then NS --> (ULong)!result[7] */
         return binop(Iop_Xor64,
                      binop(Iop_And64,
                            binop(Iop_Shr64,cc_dep1,mkU8(7)),
                            mkU64(1)),
                      mkU64(1));
      }

      /*---------------- INCB ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
         /* 8-bit inc, then LE --> sign bit of the arg.  cc_dep1 is
            the result of the inc, so the arg is cc_dep1 - 1. */
         return binop(Iop_And64,
                      binop(Iop_Shr64,
                            binop(Iop_Sub64, cc_dep1, mkU64(1)),
                            mkU8(7)),
                      mkU64(1));
      }

      /*---------------- INCW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
         /* 16-bit inc, then Z --> test dst == 0.  The shift by 48
            discards everything above the low 16 bits. */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- DECL ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
         /* dec L, then Z --> test dst == 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpEQ32,
                           unop(Iop_64to32, cc_dep1),
                           mkU32(0)));
      }

      /*---------------- DECW ----------------*/

      if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
         /* 16-bit dec, then NZ --> test dst != 0 */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpNE64,
                           binop(Iop_Shl64,cc_dep1,mkU8(48)),
                           mkU64(0)));
      }

      /*---------------- COPY ----------------*/
      /* This can happen, as a result of amd64 FP compares: "comisd ... ;
         jbe" for example. */

      if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
          (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
         /* COPY, then BE --> extract C and Z from dep1, and test (C
            or Z == 1). */
         /* COPY, then NBE --> extract C and Z from dep1, and test (C
            or Z == 0). */
         ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(
                        Iop_Or64,
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                        binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
                     ),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
         /* COPY, then B --> extract C dep1, and test (C == 1). */
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpNE64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
                     mkU64(1)
                  ),
                  mkU64(0)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY)
          && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
         /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
         /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
         /* ULong, to match both the 64-bit constant it is turned into
            and the BE/NBE case above. */
         ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpEQ64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
                     mkU64(1)
                  ),
                  mkU64(nnn)
               )
            );
      }

      if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
         /* COPY, then P --> extract P from dep1, and test (P == 1). */
         return
            unop(
               Iop_1Uto64,
               binop(
                  Iop_CmpNE64,
                  binop(
                     Iop_And64,
                     binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
                     mkU64(1)
                  ),
                  mkU64(0)
               )
            );
      }

      return NULL;
   }

   /* --------- specialising "amd64g_calculate_rflags_c" --------- */

   if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
      /* specialise calls to above "calculate_rflags_c" function */
      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
      vassert(arity == 4);
      cc_op   = args[0];
      cc_dep1 = args[1];
      cc_dep2 = args[2];
      cc_ndep = args[3];

      if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           cc_dep1,
                           cc_dep2));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT32U,
                           unop(Iop_64to32, cc_dep1),
                           unop(Iop_64to32, cc_dep2)));
      }
      if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto64,
                     binop(Iop_CmpLT64U,
                           binop(Iop_And64,cc_dep1,mkU64(0xFF)),
                           binop(Iop_And64,cc_dep2,mkU64(0xFF))));
      }
      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
          || isU64(cc_op, AMD64G_CC_OP_LOGICL)
          || isU64(cc_op, AMD64G_CC_OP_LOGICW)
          || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
         /* cflag after logic is zero */
         return mkU64(0);
      }
      if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
          || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
         return cc_ndep;
      }

#     if 0
      if (cc_op->tag == Iex_Const) {
         vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
      }
#     endif

      return NULL;
   }

#  undef unop
#  undef binop
#  undef mkU64
#  undef mkU32
#  undef mkU8

   return NULL;
}
1444 
1445 
1446 /*---------------------------------------------------------------*/
1447 /*--- Supporting functions for x87 FPU activities.            ---*/
1448 /*---------------------------------------------------------------*/
1449 
host_is_little_endian(void)1450 static inline Bool host_is_little_endian ( void )
1451 {
1452    UInt x = 0x76543210;
1453    UChar* p = (UChar*)(&x);
1454    return toBool(*p == 0x10);
1455 }
1456 
1457 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
1458 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
amd64g_calculate_FXAM(ULong tag,ULong dbl)1459 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
1460 {
1461    Bool   mantissaIsZero;
1462    Int    bexp;
1463    UChar  sign;
1464    UChar* f64;
1465 
1466    vassert(host_is_little_endian());
1467 
1468    /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
1469 
1470    f64  = (UChar*)(&dbl);
1471    sign = toUChar( (f64[7] >> 7) & 1 );
1472 
1473    /* First off, if the tag indicates the register was empty,
1474       return 1,0,sign,1 */
1475    if (tag == 0) {
1476       /* vex_printf("Empty\n"); */
1477       return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
1478                                    | AMD64G_FC_MASK_C0;
1479    }
1480 
1481    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
1482    bexp &= 0x7FF;
1483 
1484    mantissaIsZero
1485       = toBool(
1486            (f64[6] & 0x0F) == 0
1487            && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
1488         );
1489 
1490    /* If both exponent and mantissa are zero, the value is zero.
1491       Return 1,0,sign,0. */
1492    if (bexp == 0 && mantissaIsZero) {
1493       /* vex_printf("Zero\n"); */
1494       return AMD64G_FC_MASK_C3 | 0
1495                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
1496    }
1497 
1498    /* If exponent is zero but mantissa isn't, it's a denormal.
1499       Return 1,1,sign,0. */
1500    if (bexp == 0 && !mantissaIsZero) {
1501       /* vex_printf("Denormal\n"); */
1502       return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
1503                                | (sign << AMD64G_FC_SHIFT_C1) | 0;
1504    }
1505 
1506    /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
1507       Return 0,1,sign,1. */
1508    if (bexp == 0x7FF && mantissaIsZero) {
1509       /* vex_printf("Inf\n"); */
1510       return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
1511                                    | AMD64G_FC_MASK_C0;
1512    }
1513 
1514    /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
1515       Return 0,0,sign,1. */
1516    if (bexp == 0x7FF && !mantissaIsZero) {
1517       /* vex_printf("NaN\n"); */
1518       return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
1519    }
1520 
1521    /* Uh, ok, we give up.  It must be a normal finite number.
1522       Return 0,1,sign,0.
1523    */
1524    /* vex_printf("normal\n"); */
1525    return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
1526 }
1527 
1528 
1529 /* This is used to implement both 'frstor' and 'fldenv'.  The latter
1530    appears to differ from the former only in that the 8 FP registers
1531    themselves are not transferred into the guest state. */
1532 static
do_put_x87(Bool moveRegs,UChar * x87_state,VexGuestAMD64State * vex_state)1533 VexEmWarn do_put_x87 ( Bool moveRegs,
1534                        /*IN*/UChar* x87_state,
1535                        /*OUT*/VexGuestAMD64State* vex_state )
1536 {
1537    Int        stno, preg;
1538    UInt       tag;
1539    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1540    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1541    Fpu_State* x87     = (Fpu_State*)x87_state;
1542    UInt       ftop    = (x87->env[FP_ENV_STAT] >> 11) & 7;
1543    UInt       tagw    = x87->env[FP_ENV_TAG];
1544    UInt       fpucw   = x87->env[FP_ENV_CTRL];
1545    UInt       c3210   = x87->env[FP_ENV_STAT] & 0x4700;
1546    VexEmWarn  ew;
1547    UInt       fpround;
1548    ULong      pair;
1549 
1550    /* Copy registers and tags */
1551    for (stno = 0; stno < 8; stno++) {
1552       preg = (stno + ftop) & 7;
1553       tag = (tagw >> (2*preg)) & 3;
1554       if (tag == 3) {
1555          /* register is empty */
1556          /* hmm, if it's empty, does it still get written?  Probably
1557             safer to say it does.  If we don't, memcheck could get out
1558             of sync, in that it thinks all FP registers are defined by
1559             this helper, but in reality some have not been updated. */
1560          if (moveRegs)
1561             vexRegs[preg] = 0; /* IEEE754 64-bit zero */
1562          vexTags[preg] = 0;
1563       } else {
1564          /* register is non-empty */
1565          if (moveRegs)
1566             convert_f80le_to_f64le( &x87->reg[10*stno],
1567                                     (UChar*)&vexRegs[preg] );
1568          vexTags[preg] = 1;
1569       }
1570    }
1571 
1572    /* stack pointer */
1573    vex_state->guest_FTOP = ftop;
1574 
1575    /* status word */
1576    vex_state->guest_FC3210 = c3210;
1577 
1578    /* handle the control word, setting FPROUND and detecting any
1579       emulation warnings. */
1580    pair    = amd64g_check_fldcw ( (ULong)fpucw );
1581    fpround = (UInt)pair & 0xFFFFFFFFULL;
1582    ew      = (VexEmWarn)(pair >> 32);
1583 
1584    vex_state->guest_FPROUND = fpround & 3;
1585 
1586    /* emulation warnings --> caller */
1587    return ew;
1588 }
1589 
1590 
1591 /* Create an x87 FPU state from the guest state, as close as
1592    we can approximate it. */
1593 static
do_get_x87(VexGuestAMD64State * vex_state,UChar * x87_state)1594 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
1595                   /*OUT*/UChar* x87_state )
1596 {
1597    Int        i, stno, preg;
1598    UInt       tagw;
1599    ULong*     vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
1600    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1601    Fpu_State* x87     = (Fpu_State*)x87_state;
1602    UInt       ftop    = vex_state->guest_FTOP;
1603    UInt       c3210   = vex_state->guest_FC3210;
1604 
1605    for (i = 0; i < 14; i++)
1606       x87->env[i] = 0;
1607 
1608    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1609    x87->env[FP_ENV_STAT]
1610       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
1611    x87->env[FP_ENV_CTRL]
1612       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
1613 
1614    /* Dump the register stack in ST order. */
1615    tagw = 0;
1616    for (stno = 0; stno < 8; stno++) {
1617       preg = (stno + ftop) & 7;
1618       if (vexTags[preg] == 0) {
1619          /* register is empty */
1620          tagw |= (3 << (2*preg));
1621          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1622                                  &x87->reg[10*stno] );
1623       } else {
1624          /* register is full. */
1625          tagw |= (0 << (2*preg));
1626          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
1627                                  &x87->reg[10*stno] );
1628       }
1629    }
1630    x87->env[FP_ENV_TAG] = toUShort(tagw);
1631 }
1632 
1633 
1634 /* CALLED FROM GENERATED CODE */
1635 /* DIRTY HELPER (reads guest state, writes guest mem) */
1636 /* NOTE: only handles 32-bit format (no REX.W on the insn) */
amd64g_dirtyhelper_FXSAVE(VexGuestAMD64State * gst,HWord addr)1637 void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
1638 {
1639    /* Derived from values obtained from
1640       vendor_id       : AuthenticAMD
1641       cpu family      : 15
1642       model           : 12
1643       model name      : AMD Athlon(tm) 64 Processor 3200+
1644       stepping        : 0
1645       cpu MHz         : 2200.000
1646       cache size      : 512 KB
1647    */
1648    /* Somewhat roundabout, but at least it's simple. */
1649    Fpu_State tmp;
1650    UShort*   addrS = (UShort*)addr;
1651    UChar*    addrC = (UChar*)addr;
1652    U128*     xmm   = (U128*)(addr + 160);
1653    UInt      mxcsr;
1654    UShort    fp_tags;
1655    UInt      summary_tags;
1656    Int       r, stno;
1657    UShort    *srcS, *dstS;
1658 
1659    do_get_x87( gst, (UChar*)&tmp );
1660    mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
1661 
1662    /* Now build the proper fxsave image from the x87 image we just
1663       made. */
1664 
1665    addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
1666    addrS[1]  = tmp.env[FP_ENV_STAT]; /* FCW: fpu status word */
1667 
1668    /* set addrS[2] in an endian-independent way */
1669    summary_tags = 0;
1670    fp_tags = tmp.env[FP_ENV_TAG];
1671    for (r = 0; r < 8; r++) {
1672       if ( ((fp_tags >> (2*r)) & 3) != 3 )
1673          summary_tags |= (1 << r);
1674    }
1675    addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
1676    addrC[5]  = 0; /* pad */
1677 
1678    /* FOP: faulting fpu opcode.  From experimentation, the real CPU
1679       does not write this field. (?!) */
1680    addrS[3]  = 0; /* BOGUS */
1681 
1682    /* RIP (Last x87 instruction pointer).  From experimentation, the
1683       real CPU does not write this field. (?!) */
1684    addrS[4]  = 0; /* BOGUS */
1685    addrS[5]  = 0; /* BOGUS */
1686    addrS[6]  = 0; /* BOGUS */
1687    addrS[7]  = 0; /* BOGUS */
1688 
1689    /* RDP (Last x87 data pointer).  From experimentation, the real CPU
1690       does not write this field. (?!) */
1691    addrS[8]  = 0; /* BOGUS */
1692    addrS[9]  = 0; /* BOGUS */
1693    addrS[10] = 0; /* BOGUS */
1694    addrS[11] = 0; /* BOGUS */
1695 
1696    addrS[12] = toUShort(mxcsr);  /* MXCSR */
1697    addrS[13] = toUShort(mxcsr >> 16);
1698 
1699    addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
1700    addrS[15] = 0x0000; /* MXCSR mask (hi16) */
1701 
1702    /* Copy in the FP registers, in ST order. */
1703    for (stno = 0; stno < 8; stno++) {
1704       srcS = (UShort*)(&tmp.reg[10*stno]);
1705       dstS = (UShort*)(&addrS[16 + 8*stno]);
1706       dstS[0] = srcS[0];
1707       dstS[1] = srcS[1];
1708       dstS[2] = srcS[2];
1709       dstS[3] = srcS[3];
1710       dstS[4] = srcS[4];
1711       dstS[5] = 0;
1712       dstS[6] = 0;
1713       dstS[7] = 0;
1714    }
1715 
1716    /* That's the first 160 bytes of the image done.  Now only %xmm0
1717       .. %xmm15 remain to be copied.  If the host is big-endian, these
1718       need to be byte-swapped. */
1719    vassert(host_is_little_endian());
1720 
1721 #  define COPY_U128(_dst,_src)                       \
1722       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
1723            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
1724       while (0)
1725 
1726    COPY_U128( xmm[0],  gst->guest_YMM0 );
1727    COPY_U128( xmm[1],  gst->guest_YMM1 );
1728    COPY_U128( xmm[2],  gst->guest_YMM2 );
1729    COPY_U128( xmm[3],  gst->guest_YMM3 );
1730    COPY_U128( xmm[4],  gst->guest_YMM4 );
1731    COPY_U128( xmm[5],  gst->guest_YMM5 );
1732    COPY_U128( xmm[6],  gst->guest_YMM6 );
1733    COPY_U128( xmm[7],  gst->guest_YMM7 );
1734    COPY_U128( xmm[8],  gst->guest_YMM8 );
1735    COPY_U128( xmm[9],  gst->guest_YMM9 );
1736    COPY_U128( xmm[10], gst->guest_YMM10 );
1737    COPY_U128( xmm[11], gst->guest_YMM11 );
1738    COPY_U128( xmm[12], gst->guest_YMM12 );
1739    COPY_U128( xmm[13], gst->guest_YMM13 );
1740    COPY_U128( xmm[14], gst->guest_YMM14 );
1741    COPY_U128( xmm[15], gst->guest_YMM15 );
1742 
1743 #  undef COPY_U128
1744 }
1745 
1746 
1747 /* CALLED FROM GENERATED CODE */
1748 /* DIRTY HELPER (writes guest state, reads guest mem) */
amd64g_dirtyhelper_FXRSTOR(VexGuestAMD64State * gst,HWord addr)1749 VexEmWarn amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State* gst, HWord addr )
1750 {
1751    Fpu_State tmp;
1752    VexEmWarn warnX87 = EmWarn_NONE;
1753    VexEmWarn warnXMM = EmWarn_NONE;
1754    UShort*   addrS   = (UShort*)addr;
1755    UChar*    addrC   = (UChar*)addr;
1756    U128*     xmm     = (U128*)(addr + 160);
1757    UShort    fp_tags;
1758    Int       r, stno, i;
1759 
1760    /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
1761       to be byte-swapped. */
1762    vassert(host_is_little_endian());
1763 
1764 #  define COPY_U128(_dst,_src)                       \
1765       do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
1766            _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
1767       while (0)
1768 
1769    COPY_U128( gst->guest_YMM0, xmm[0] );
1770    COPY_U128( gst->guest_YMM1, xmm[1] );
1771    COPY_U128( gst->guest_YMM2, xmm[2] );
1772    COPY_U128( gst->guest_YMM3, xmm[3] );
1773    COPY_U128( gst->guest_YMM4, xmm[4] );
1774    COPY_U128( gst->guest_YMM5, xmm[5] );
1775    COPY_U128( gst->guest_YMM6, xmm[6] );
1776    COPY_U128( gst->guest_YMM7, xmm[7] );
1777    COPY_U128( gst->guest_YMM8, xmm[8] );
1778    COPY_U128( gst->guest_YMM9, xmm[9] );
1779    COPY_U128( gst->guest_YMM10, xmm[10] );
1780    COPY_U128( gst->guest_YMM11, xmm[11] );
1781    COPY_U128( gst->guest_YMM12, xmm[12] );
1782    COPY_U128( gst->guest_YMM13, xmm[13] );
1783    COPY_U128( gst->guest_YMM14, xmm[14] );
1784    COPY_U128( gst->guest_YMM15, xmm[15] );
1785 
1786 #  undef COPY_U128
1787 
1788    /* Copy the x87 registers out of the image, into a temporary
1789       Fpu_State struct. */
1790    for (i = 0; i < 14; i++) tmp.env[i] = 0;
1791    for (i = 0; i < 80; i++) tmp.reg[i] = 0;
1792    /* fill in tmp.reg[0..7] */
1793    for (stno = 0; stno < 8; stno++) {
1794       UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
1795       UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
1796       dstS[0] = srcS[0];
1797       dstS[1] = srcS[1];
1798       dstS[2] = srcS[2];
1799       dstS[3] = srcS[3];
1800       dstS[4] = srcS[4];
1801    }
1802    /* fill in tmp.env[0..13] */
1803    tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
1804    tmp.env[FP_ENV_STAT] = addrS[1]; /* FCW: fpu status word */
1805 
1806    fp_tags = 0;
1807    for (r = 0; r < 8; r++) {
1808       if (addrC[4] & (1<<r))
1809          fp_tags |= (0 << (2*r)); /* EMPTY */
1810       else
1811          fp_tags |= (3 << (2*r)); /* VALID -- not really precise enough. */
1812    }
1813    tmp.env[FP_ENV_TAG] = fp_tags;
1814 
1815    /* Now write 'tmp' into the guest state. */
1816    warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
1817 
1818    { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
1819                 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
1820      ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
1821 
1822      warnXMM = (VexEmWarn)(w64 >> 32);
1823 
1824      gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
1825    }
1826 
1827    /* Prefer an X87 emwarn over an XMM one, if both exist. */
1828    if (warnX87 != EmWarn_NONE)
1829       return warnX87;
1830    else
1831       return warnXMM;
1832 }
1833 
1834 
1835 /* DIRTY HELPER (writes guest state) */
1836 /* Initialise the x87 FPU state as per 'finit'. */
amd64g_dirtyhelper_FINIT(VexGuestAMD64State * gst)1837 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
1838 {
1839    Int i;
1840    gst->guest_FTOP = 0;
1841    for (i = 0; i < 8; i++) {
1842       gst->guest_FPTAG[i] = 0; /* empty */
1843       gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
1844    }
1845    gst->guest_FPROUND = (ULong)Irrm_NEAREST;
1846    gst->guest_FC3210  = 0;
1847 }
1848 
1849 
1850 /* CALLED FROM GENERATED CODE */
1851 /* DIRTY HELPER (reads guest memory) */
amd64g_dirtyhelper_loadF80le(ULong addrU)1852 ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
1853 {
1854    ULong f64;
1855    convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
1856    return f64;
1857 }
1858 
1859 /* CALLED FROM GENERATED CODE */
1860 /* DIRTY HELPER (writes guest memory) */
amd64g_dirtyhelper_storeF80le(ULong addrU,ULong f64)1861 void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
1862 {
1863    convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
1864 }
1865 
1866 
1867 /* CALLED FROM GENERATED CODE */
1868 /* CLEAN HELPER */
1869 /* mxcsr[15:0] contains a SSE native format MXCSR value.
1870    Extract from it the required SSEROUND value and any resulting
1871    emulation warning, and return (warn << 32) | sseround value.
1872 */
amd64g_check_ldmxcsr(ULong mxcsr)1873 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
1874 {
1875    /* Decide on a rounding mode.  mxcsr[14:13] holds it. */
1876    /* NOTE, encoded exactly as per enum IRRoundingMode. */
1877    ULong rmode = (mxcsr >> 13) & 3;
1878 
1879    /* Detect any required emulation warnings. */
1880    VexEmWarn ew = EmWarn_NONE;
1881 
1882    if ((mxcsr & 0x1F80) != 0x1F80) {
1883       /* unmasked exceptions! */
1884       ew = EmWarn_X86_sseExns;
1885    }
1886    else
1887    if (mxcsr & (1<<15)) {
1888       /* FZ is set */
1889       ew = EmWarn_X86_fz;
1890    }
1891    else
1892    if (mxcsr & (1<<6)) {
1893       /* DAZ is set */
1894       ew = EmWarn_X86_daz;
1895    }
1896 
1897    return (((ULong)ew) << 32) | ((ULong)rmode);
1898 }
1899 
1900 
1901 /* CALLED FROM GENERATED CODE */
1902 /* CLEAN HELPER */
1903 /* Given sseround as an IRRoundingMode value, create a suitable SSE
1904    native format MXCSR value. */
amd64g_create_mxcsr(ULong sseround)1905 ULong amd64g_create_mxcsr ( ULong sseround )
1906 {
1907    sseround &= 3;
1908    return 0x1F80 | (sseround << 13);
1909 }
1910 
1911 
1912 /* CLEAN HELPER */
1913 /* fpucw[15:0] contains a x87 native format FPU control word.
1914    Extract from it the required FPROUND value and any resulting
1915    emulation warning, and return (warn << 32) | fpround value.
1916 */
amd64g_check_fldcw(ULong fpucw)1917 ULong amd64g_check_fldcw ( ULong fpucw )
1918 {
1919    /* Decide on a rounding mode.  fpucw[11:10] holds it. */
1920    /* NOTE, encoded exactly as per enum IRRoundingMode. */
1921    ULong rmode = (fpucw >> 10) & 3;
1922 
1923    /* Detect any required emulation warnings. */
1924    VexEmWarn ew = EmWarn_NONE;
1925 
1926    if ((fpucw & 0x3F) != 0x3F) {
1927       /* unmasked exceptions! */
1928       ew = EmWarn_X86_x87exns;
1929    }
1930    else
1931    if (((fpucw >> 8) & 3) != 3) {
1932       /* unsupported precision */
1933       ew = EmWarn_X86_x87precision;
1934    }
1935 
1936    return (((ULong)ew) << 32) | ((ULong)rmode);
1937 }
1938 
1939 
1940 /* CLEAN HELPER */
1941 /* Given fpround as an IRRoundingMode value, create a suitable x87
1942    native format FPU control word. */
amd64g_create_fpucw(ULong fpround)1943 ULong amd64g_create_fpucw ( ULong fpround )
1944 {
1945    fpround &= 3;
1946    return 0x037F | (fpround << 10);
1947 }
1948 
1949 
1950 /* This is used to implement 'fldenv'.
1951    Reads 28 bytes at x87_state[0 .. 27]. */
1952 /* CALLED FROM GENERATED CODE */
1953 /* DIRTY HELPER */
amd64g_dirtyhelper_FLDENV(VexGuestAMD64State * vex_state,HWord x87_state)1954 VexEmWarn amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
1955                                       /*IN*/HWord x87_state)
1956 {
1957    return do_put_x87( False, (UChar*)x87_state, vex_state );
1958 }
1959 
1960 
1961 /* CALLED FROM GENERATED CODE */
1962 /* DIRTY HELPER */
1963 /* Create an x87 FPU env from the guest state, as close as we can
1964    approximate it.  Writes 28 bytes at x87_state[0..27]. */
amd64g_dirtyhelper_FSTENV(VexGuestAMD64State * vex_state,HWord x87_state)1965 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
1966                                  /*OUT*/HWord x87_state )
1967 {
1968    Int        i, stno, preg;
1969    UInt       tagw;
1970    UChar*     vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
1971    Fpu_State* x87     = (Fpu_State*)x87_state;
1972    UInt       ftop    = vex_state->guest_FTOP;
1973    ULong      c3210   = vex_state->guest_FC3210;
1974 
1975    for (i = 0; i < 14; i++)
1976       x87->env[i] = 0;
1977 
1978    x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
1979    x87->env[FP_ENV_STAT]
1980       = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
1981    x87->env[FP_ENV_CTRL]
1982       = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
1983 
1984    /* Compute the x87 tag word. */
1985    tagw = 0;
1986    for (stno = 0; stno < 8; stno++) {
1987       preg = (stno + ftop) & 7;
1988       if (vexTags[preg] == 0) {
1989          /* register is empty */
1990          tagw |= (3 << (2*preg));
1991       } else {
1992          /* register is full. */
1993          tagw |= (0 << (2*preg));
1994       }
1995    }
1996    x87->env[FP_ENV_TAG] = toUShort(tagw);
1997 
1998    /* We don't dump the x87 registers, tho. */
1999 }
2000 
2001 
2002 /* This is used to implement 'fnsave'.
2003    Writes 108 bytes at x87_state[0 .. 107]. */
2004 /* CALLED FROM GENERATED CODE */
2005 /* DIRTY HELPER */
amd64g_dirtyhelper_FNSAVE(VexGuestAMD64State * vex_state,HWord x87_state)2006 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2007                                  /*OUT*/HWord x87_state)
2008 {
2009    do_get_x87( vex_state, (UChar*)x87_state );
2010 }
2011 
2012 
2013 /* This is used to implement 'fnsaves'.
2014    Writes 94 bytes at x87_state[0 .. 93]. */
2015 /* CALLED FROM GENERATED CODE */
2016 /* DIRTY HELPER */
amd64g_dirtyhelper_FNSAVES(VexGuestAMD64State * vex_state,HWord x87_state)2017 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2018                                   /*OUT*/HWord x87_state)
2019 {
2020    Int           i, stno, preg;
2021    UInt          tagw;
2022    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2023    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2024    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2025    UInt          ftop    = vex_state->guest_FTOP;
2026    UInt          c3210   = vex_state->guest_FC3210;
2027 
2028    for (i = 0; i < 7; i++)
2029       x87->env[i] = 0;
2030 
2031    x87->env[FPS_ENV_STAT]
2032       = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2033    x87->env[FPS_ENV_CTRL]
2034       = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2035 
2036    /* Dump the register stack in ST order. */
2037    tagw = 0;
2038    for (stno = 0; stno < 8; stno++) {
2039       preg = (stno + ftop) & 7;
2040       if (vexTags[preg] == 0) {
2041          /* register is empty */
2042          tagw |= (3 << (2*preg));
2043          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2044                                  &x87->reg[10*stno] );
2045       } else {
2046          /* register is full. */
2047          tagw |= (0 << (2*preg));
2048          convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2049                                  &x87->reg[10*stno] );
2050       }
2051    }
2052    x87->env[FPS_ENV_TAG] = toUShort(tagw);
2053 }
2054 
2055 
2056 /* This is used to implement 'frstor'.
2057    Reads 108 bytes at x87_state[0 .. 107]. */
2058 /* CALLED FROM GENERATED CODE */
2059 /* DIRTY HELPER */
amd64g_dirtyhelper_FRSTOR(VexGuestAMD64State * vex_state,HWord x87_state)2060 VexEmWarn amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2061                                       /*IN*/HWord x87_state)
2062 {
2063    return do_put_x87( True, (UChar*)x87_state, vex_state );
2064 }
2065 
2066 
2067 /* This is used to implement 'frstors'.
2068    Reads 94 bytes at x87_state[0 .. 93]. */
2069 /* CALLED FROM GENERATED CODE */
2070 /* DIRTY HELPER */
amd64g_dirtyhelper_FRSTORS(VexGuestAMD64State * vex_state,HWord x87_state)2071 VexEmWarn amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2072                                        /*IN*/HWord x87_state)
2073 {
2074    Int           stno, preg;
2075    UInt          tag;
2076    ULong*        vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2077    UChar*        vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2078    Fpu_State_16* x87     = (Fpu_State_16*)x87_state;
2079    UInt          ftop    = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2080    UInt          tagw    = x87->env[FPS_ENV_TAG];
2081    UInt          fpucw   = x87->env[FPS_ENV_CTRL];
2082    UInt          c3210   = x87->env[FPS_ENV_STAT] & 0x4700;
2083    VexEmWarn     ew;
2084    UInt          fpround;
2085    ULong         pair;
2086 
2087    /* Copy registers and tags */
2088    for (stno = 0; stno < 8; stno++) {
2089       preg = (stno + ftop) & 7;
2090       tag = (tagw >> (2*preg)) & 3;
2091       if (tag == 3) {
2092          /* register is empty */
2093          /* hmm, if it's empty, does it still get written?  Probably
2094             safer to say it does.  If we don't, memcheck could get out
2095             of sync, in that it thinks all FP registers are defined by
2096             this helper, but in reality some have not been updated. */
2097          vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2098          vexTags[preg] = 0;
2099       } else {
2100          /* register is non-empty */
2101          convert_f80le_to_f64le( &x87->reg[10*stno],
2102                                  (UChar*)&vexRegs[preg] );
2103          vexTags[preg] = 1;
2104       }
2105    }
2106 
2107    /* stack pointer */
2108    vex_state->guest_FTOP = ftop;
2109 
2110    /* status word */
2111    vex_state->guest_FC3210 = c3210;
2112 
2113    /* handle the control word, setting FPROUND and detecting any
2114       emulation warnings. */
2115    pair    = amd64g_check_fldcw ( (ULong)fpucw );
2116    fpround = (UInt)pair & 0xFFFFFFFFULL;
2117    ew      = (VexEmWarn)(pair >> 32);
2118 
2119    vex_state->guest_FPROUND = fpround & 3;
2120 
2121    /* emulation warnings --> caller */
2122    return ew;
2123 }
2124 
2125 
2126 /*---------------------------------------------------------------*/
2127 /*--- Misc integer helpers, including rotates and CPUID.      ---*/
2128 /*---------------------------------------------------------------*/
2129 
2130 /* Claim to be the following CPU, which is probably representative of
2131    the lowliest (earliest) amd64 offerings.  It can do neither sse3
2132    nor cx16.
2133 
2134    vendor_id       : AuthenticAMD
2135    cpu family      : 15
2136    model           : 5
2137    model name      : AMD Opteron (tm) Processor 848
2138    stepping        : 10
2139    cpu MHz         : 1797.682
2140    cache size      : 1024 KB
2141    fpu             : yes
2142    fpu_exception   : yes
2143    cpuid level     : 1
2144    wp              : yes
2145    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2146                      mtrr pge mca cmov pat pse36 clflush mmx fxsr
2147                      sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2148    bogomips        : 3600.62
2149    TLB size        : 1088 4K pages
2150    clflush size    : 64
2151    cache_alignment : 64
2152    address sizes   : 40 bits physical, 48 bits virtual
2153    power management: ts fid vid ttp
2154 
2155    2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2156    we don't support them.  See #291568.  3dnow is 80000001.EDX.31
2157    and 3dnowext is 80000001.EDX.30.
2158 */
void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
{
   /* The leaf is selected by the low 32 bits of guest_RAX.  The four
      result values are collected here and written back to guest
      RAX/RBX/RCX/RDX at the end; leaves not listed below give all
      zeroes. */
   UInt eax = 0, ebx = 0, ecx = 0, edx = 0;

#  define LEAF(_a,_b,_c,_d)                    \
      do { eax = (_a); ebx = (_b);             \
           ecx = (_c); edx = (_d);             \
      } while (0)

   switch (0xFFFFFFFF & st->guest_RAX) {
      case 0x00000000:
         LEAF(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
         break;
      case 0x00000001:
         LEAF(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
         break;
      case 0x80000000:
         LEAF(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
         break;
      case 0x80000001:
         /* Don't claim to support 3dnow or 3dnowext.  0xe1d3fbff is
            the original it-is-supported value that the h/w provides.
            See #291568. */
         LEAF(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
                                                  0x21d3fbff);
         break;
      case 0x80000002:
         LEAF(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
         break;
      case 0x80000003:
         LEAF(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
         break;
      case 0x80000004:
         LEAF(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x80000005:
         LEAF(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
         break;
      case 0x80000006:
         LEAF(0x00000000, 0x42004200, 0x04008140, 0x00000000);
         break;
      case 0x80000007:
         LEAF(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
         break;
      case 0x80000008:
         LEAF(0x00003028, 0x00000000, 0x00000000, 0x00000000);
         break;
      default:
         /* keep the all-zero result */
         break;
   }
#  undef LEAF

   st->guest_RAX = (ULong)eax;
   st->guest_RBX = (ULong)ebx;
   st->guest_RCX = (ULong)ecx;
   st->guest_RDX = (ULong)edx;
}
2212 
2213 
2214 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
2215    capable.
2216 
2217    vendor_id       : GenuineIntel
2218    cpu family      : 6
2219    model           : 15
2220    model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
2221    stepping        : 6
2222    cpu MHz         : 2394.000
2223    cache size      : 4096 KB
2224    physical id     : 0
2225    siblings        : 2
2226    core id         : 0
2227    cpu cores       : 2
2228    fpu             : yes
2229    fpu_exception   : yes
2230    cpuid level     : 10
2231    wp              : yes
2232    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2233                      mtrr pge mca cmov pat pse36 clflush dts acpi
2234                      mmx fxsr sse sse2 ss ht tm syscall nx lm
2235                      constant_tsc pni monitor ds_cpl vmx est tm2
2236                      cx16 xtpr lahf_lm
2237    bogomips        : 4798.78
2238    clflush size    : 64
2239    cache_alignment : 64
2240    address sizes   : 36 bits physical, 48 bits virtual
2241    power management:
2242 */
void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
{
   /* The leaf is selected by the low 32 bits of guest_RAX (and, for
      leaf 4, the sub-leaf by the low 32 bits of guest_RCX).  The four
      result values are collected here and written back to guest
      RAX/RBX/RCX/RDX at the end.  They start out as the leaf 0xa
      values, because any leaf not listed below is answered exactly
      as leaf 0xa is. */
   UInt eax = 0x07280202;
   UInt ebx = 0x00000000;
   UInt ecx = 0x00000000;
   UInt edx = 0x00000000;

#  define LEAF(_a,_b,_c,_d)                    \
      do { eax = (_a); ebx = (_b);             \
           ecx = (_c); edx = (_d);             \
      } while (0)

   switch (0xFFFFFFFF & st->guest_RAX) {
      case 0x00000000:
         LEAF(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
         break;
      case 0x00000001:
         LEAF(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
         break;
      case 0x00000002:
         LEAF(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
         break;
      case 0x00000003:
         LEAF(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x00000004:
         switch (0xFFFFFFFF & st->guest_RCX) {
            case 0x00000000:
               LEAF(0x04000121, 0x01c0003f, 0x0000003f, 0x00000001);
               break;
            case 0x00000001:
               LEAF(0x04000122, 0x01c0003f, 0x0000003f, 0x00000001);
               break;
            case 0x00000002:
               LEAF(0x04004143, 0x03c0003f, 0x00000fff, 0x00000001);
               break;
            default:
               LEAF(0x00000000, 0x00000000, 0x00000000, 0x00000000);
               break;
         }
         break;
      case 0x00000005:
         LEAF(0x00000040, 0x00000040, 0x00000003, 0x00000020);
         break;
      case 0x00000006:
         LEAF(0x00000001, 0x00000002, 0x00000001, 0x00000000);
         break;
      case 0x00000007:
         LEAF(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x00000008:
         LEAF(0x00000400, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x00000009:
         LEAF(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x80000000:
         LEAF(0x80000008, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x80000001:
         LEAF(0x00000000, 0x00000000, 0x00000001, 0x20100800);
         break;
      case 0x80000002:
         LEAF(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
         break;
      case 0x80000003:
         LEAF(0x43203229, 0x20205550, 0x20202020, 0x20202020);
         break;
      case 0x80000004:
         LEAF(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
         break;
      case 0x80000005:
         LEAF(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x80000006:
         LEAF(0x00000000, 0x00000000, 0x10008040, 0x00000000);
         break;
      case 0x80000007:
         LEAF(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x80000008:
         LEAF(0x00003024, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x0000000a:
      default:
         /* keep the initial (leaf 0xa) values */
         break;
   }
#  undef LEAF

   st->guest_RAX = (ULong)eax;
   st->guest_RBX = (ULong)ebx;
   st->guest_RCX = (ULong)ecx;
   st->guest_RDX = (ULong)edx;
}
2329 
2330 
2331 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
2332    capable.
2333 
2334    vendor_id       : GenuineIntel
2335    cpu family      : 6
2336    model           : 37
2337    model name      : Intel(R) Core(TM) i5 CPU         670  @ 3.47GHz
2338    stepping        : 2
2339    cpu MHz         : 3334.000
2340    cache size      : 4096 KB
2341    physical id     : 0
2342    siblings        : 4
2343    core id         : 0
2344    cpu cores       : 2
2345    apicid          : 0
2346    initial apicid  : 0
2347    fpu             : yes
2348    fpu_exception   : yes
2349    cpuid level     : 11
2350    wp              : yes
2351    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2352                      mtrr pge mca cmov pat pse36 clflush dts acpi
2353                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2354                      lm constant_tsc arch_perfmon pebs bts rep_good
2355                      xtopology nonstop_tsc aperfmperf pni pclmulqdq
2356                      dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
2357                      xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
2358                      arat tpr_shadow vnmi flexpriority ept vpid
2359    bogomips        : 6957.57
2360    clflush size    : 64
2361    cache_alignment : 64
2362    address sizes   : 36 bits physical, 48 bits virtual
2363    power management:
2364 */
/* Emulate CPUID for the Core i5 670 described in the comment above.
   The leaf is taken from the low 32 bits of guest RAX and, for
   leaves 4, 0xb and 0xd, the sub-leaf from the low 32 bits of guest
   RCX.  The four 32-bit result words are written, zero-extended, to
   guest RAX, RBX, RCX and RDX (in that order). */
void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
{
   /* Leaves whose answer ignores ECX: { leaf, eax, ebx, ecx, edx }. */
   static const UInt fixed_leaves[][5] = {
      { 0x00000000, 0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69 },
      { 0x00000001, 0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff },
      { 0x00000002, 0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c },
      { 0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000005, 0x00000040, 0x00000040, 0x00000003, 0x00001120 },
      { 0x00000006, 0x00000007, 0x00000002, 0x00000001, 0x00000000 },
      { 0x00000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000008, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000009, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x0000000a, 0x07300403, 0x00000004, 0x00000000, 0x00000603 },
      { 0x0000000c, 0x00000001, 0x00000002, 0x00000100, 0x00000000 },
      { 0x80000000, 0x80000008, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000001, 0x00000000, 0x00000000, 0x00000001, 0x28100800 },
      { 0x80000002, 0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865 },
      { 0x80000003, 0x35692029, 0x55504320, 0x20202020, 0x20202020 },
      { 0x80000004, 0x30373620, 0x20402020, 0x37342e33, 0x007a4847 },
      { 0x80000005, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000006, 0x00000000, 0x00000000, 0x01006040, 0x00000000 },
      { 0x80000007, 0x00000000, 0x00000000, 0x00000000, 0x00000100 },
      { 0x80000008, 0x00003024, 0x00000000, 0x00000000, 0x00000000 }
   };
   /* Leaf 4, indexed by sub-leaf 0..3: { eax, ebx, ecx, edx }. */
   static const UInt leaf_4[4][4] = {
      { 0x1c004121, 0x01c0003f, 0x0000003f, 0x00000000 },
      { 0x1c004122, 0x00c0003f, 0x0000007f, 0x00000000 },
      { 0x1c004143, 0x01c0003f, 0x000001ff, 0x00000000 },
      { 0x1c03c163, 0x03c0003f, 0x00000fff, 0x00000002 }
   };
   /* Leaves 0xb and 0xd give the same answers; indexed by sub-leaf
      0..1: { eax, ebx, ecx, edx }. */
   static const UInt leaf_b_d[2][4] = {
      { 0x00000001, 0x00000002, 0x00000100, 0x00000000 },
      { 0x00000004, 0x00000004, 0x00000201, 0x00000000 }
   };

   const UInt leaf    = (UInt)st->guest_RAX;
   const UInt subleaf = (UInt)st->guest_RCX;
   const UInt n_fixed = sizeof(fixed_leaves) / sizeof(fixed_leaves[0]);

   /* Answer given for any leaf that is not listed anywhere. */
   UInt eax = 0x00000001;
   UInt ebx = 0x00000002;
   UInt ecx = 0x00000100;
   UInt edx = 0x00000000;
   UInt i;

   if (leaf == 0x00000004) {
      if (subleaf < 4) {
         eax = leaf_4[subleaf][0];
         ebx = leaf_4[subleaf][1];
         ecx = leaf_4[subleaf][2];
         edx = leaf_4[subleaf][3];
      } else {
         eax = ebx = ecx = edx = 0x00000000;
      }
   }
   else if (leaf == 0x0000000b || leaf == 0x0000000d) {
      if (subleaf < 2) {
         eax = leaf_b_d[subleaf][0];
         ebx = leaf_b_d[subleaf][1];
         ecx = leaf_b_d[subleaf][2];
         edx = leaf_b_d[subleaf][3];
      } else {
         /* unknown sub-leaf: ECX is handed back unchanged */
         eax = 0x00000000;
         ebx = 0x00000000;
         ecx = subleaf;
         edx = 0x00000000;
      }
   }
   else {
      for (i = 0; i < n_fixed; i++) {
         if (fixed_leaves[i][0] == leaf) {
            eax = fixed_leaves[i][1];
            ebx = fixed_leaves[i][2];
            ecx = fixed_leaves[i][3];
            edx = fixed_leaves[i][4];
            break;
         }
      }
   }

   st->guest_RAX = (ULong)eax;
   st->guest_RBX = (ULong)ebx;
   st->guest_RCX = (ULong)ecx;
   st->guest_RDX = (ULong)edx;
}
2481 
2482 
2483 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
2484    capable.
2485 
2486    vendor_id       : GenuineIntel
2487    cpu family      : 6
2488    model           : 42
2489    model name      : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
2490    stepping        : 7
2491    cpu MHz         : 1600.000
2492    cache size      : 6144 KB
2493    physical id     : 0
2494    siblings        : 4
2495    core id         : 3
2496    cpu cores       : 4
2497    apicid          : 6
2498    initial apicid  : 6
2499    fpu             : yes
2500    fpu_exception   : yes
2501    cpuid level     : 13
2502    wp              : yes
2503    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
2504                      mtrr pge mca cmov pat pse36 clflush dts acpi
2505                      mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
2506                      lm constant_tsc arch_perfmon pebs bts rep_good
2507                      nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
2508                      dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
2509                      xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
2510                      lahf_lm ida arat epb xsaveopt pln pts dts
2511                      tpr_shadow vnmi flexpriority ept vpid
2512 
2513    bogomips        : 5768.94
2514    clflush size    : 64
2515    cache_alignment : 64
2516    address sizes   : 36 bits physical, 48 bits virtual
2517    power management:
2518 */
/* Emulate CPUID for the Core i5-2300 described in the comment above.
   The leaf is taken from the low 32 bits of guest RAX and, for
   leaves 4, 0xb and 0xd, the sub-leaf from the low 32 bits of guest
   RCX.  The four 32-bit result words are written, zero-extended, to
   guest RAX, RBX, RCX and RDX (in that order). */
void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
{
   /* Leaves whose answer ignores ECX: { leaf, eax, ebx, ecx, edx }. */
   static const UInt fixed_leaves[][5] = {
      { 0x00000000, 0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69 },
      { 0x00000001, 0x000206a7, 0x00100800, 0x1f9ae3bf, 0xbfebfbff },
      { 0x00000002, 0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000 },
      { 0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000005, 0x00000040, 0x00000040, 0x00000003, 0x00001120 },
      { 0x00000006, 0x00000077, 0x00000002, 0x00000009, 0x00000000 },
      { 0x00000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000008, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000009, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x0000000a, 0x07300803, 0x00000000, 0x00000000, 0x00000603 },
      { 0x0000000c, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x0000000e, 0x00000007, 0x00000340, 0x00000340, 0x00000000 },
      { 0x0000000f, 0x00000007, 0x00000340, 0x00000340, 0x00000000 },
      { 0x80000000, 0x80000008, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000001, 0x00000000, 0x00000000, 0x00000001, 0x28100800 },
      { 0x80000002, 0x20202020, 0x20202020, 0x65746e49, 0x2952286c },
      { 0x80000003, 0x726f4320, 0x4d542865, 0x35692029, 0x3033322d },
      { 0x80000004, 0x50432030, 0x20402055, 0x30382e32, 0x007a4847 },
      { 0x80000005, 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
      { 0x80000006, 0x00000000, 0x00000000, 0x01006040, 0x00000000 },
      { 0x80000007, 0x00000000, 0x00000000, 0x00000000, 0x00000100 },
      { 0x80000008, 0x00003024, 0x00000000, 0x00000000, 0x00000000 }
   };
   /* Leaf 4, indexed by sub-leaf 0..3: { eax, ebx, ecx, edx }. */
   static const UInt leaf_4[4][4] = {
      { 0x1c004121, 0x01c0003f, 0x0000003f, 0x00000000 },
      { 0x1c004122, 0x01c0003f, 0x0000003f, 0x00000000 },
      { 0x1c004143, 0x01c0003f, 0x000001ff, 0x00000000 },
      { 0x1c03c163, 0x02c0003f, 0x00001fff, 0x00000006 }
   };
   /* Leaf 0xb, indexed by sub-leaf 0..1. */
   static const UInt leaf_b[2][4] = {
      { 0x00000001, 0x00000001, 0x00000100, 0x00000000 },
      { 0x00000004, 0x00000004, 0x00000201, 0x00000000 }
   };
   /* Leaf 0xd, indexed by sub-leaf 0..2. */
   static const UInt leaf_d[3][4] = {
      { 0x00000007, 0x00000340, 0x00000340, 0x00000000 },
      { 0x00000001, 0x00000000, 0x00000000, 0x00000000 },
      { 0x00000100, 0x00000240, 0x00000000, 0x00000000 }
   };

   const UInt leaf    = (UInt)st->guest_RAX;
   const UInt subleaf = (UInt)st->guest_RCX;
   const UInt n_fixed = sizeof(fixed_leaves) / sizeof(fixed_leaves[0]);

   /* Answer given for any leaf that is not listed anywhere. */
   UInt eax = 0x00000007;
   UInt ebx = 0x00000340;
   UInt ecx = 0x00000340;
   UInt edx = 0x00000000;
   UInt i;

   if (leaf == 0x00000004) {
      if (subleaf < 4) {
         eax = leaf_4[subleaf][0];
         ebx = leaf_4[subleaf][1];
         ecx = leaf_4[subleaf][2];
         edx = leaf_4[subleaf][3];
      } else {
         eax = ebx = ecx = edx = 0x00000000;
      }
   }
   else if (leaf == 0x0000000b) {
      if (subleaf < 2) {
         eax = leaf_b[subleaf][0];
         ebx = leaf_b[subleaf][1];
         ecx = leaf_b[subleaf][2];
         edx = leaf_b[subleaf][3];
      } else {
         /* unknown sub-leaf: ECX is handed back unchanged */
         eax = 0x00000000;
         ebx = 0x00000000;
         ecx = subleaf;
         edx = 0x00000000;
      }
   }
   else if (leaf == 0x0000000d) {
      if (subleaf < 3) {
         eax = leaf_d[subleaf][0];
         ebx = leaf_d[subleaf][1];
         ecx = leaf_d[subleaf][2];
         edx = leaf_d[subleaf][3];
      } else {
         eax = ebx = ecx = edx = 0x00000000;
      }
   }
   else {
      for (i = 0; i < n_fixed; i++) {
         if (fixed_leaves[i][0] == leaf) {
            eax = fixed_leaves[i][1];
            ebx = fixed_leaves[i][2];
            ecx = fixed_leaves[i][3];
            edx = fixed_leaves[i][4];
            break;
         }
      }
   }

   st->guest_RAX = (ULong)eax;
   st->guest_RBX = (ULong)ebx;
   st->guest_RCX = (ULong)ecx;
   st->guest_RDX = (ULong)edx;
}
2643 
2644 
/* Rotate 'arg' right through the carry flag.

   arg        value to rotate; only the low |szIN| bytes take part
   rot_amt    rotate count; masked to 6 bits for 8-byte operands and
              to 5 bits otherwise
   rflags_in  incoming flags; the carry bit feeds the rotation
   szIN       operand size in bytes (1, 2, 4 or 8).  Passing the
              negated size asks for the flags instead of the value.

   Returns either the rotated value or rflags_in with the C and O
   bits replaced -- never both.  O is derived from the operand's top
   bit and the incoming carry, before any rotation is done.  Any
   other size panics. */
ULong amd64g_calculate_RCR ( ULong arg,
                             ULong rot_amt,
                             ULong rflags_in,
                             Long  szIN )
{
   Bool  wantRflags = toBool(szIN < 0);
   ULong sz         = wantRflags ? (-szIN) : szIN;
   ULong count      = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   ULong msb        = 0;  /* bit index of the operand's top bit */
   ULong carry, ovf, below_msb, dropped;

   switch (sz) {
      case 8:  msb = 63; break;
      case 4:  msb = 31; break;
      case 2:  msb = 15; break;
      case 1:  msb = 7;  break;
      default:
         vpanic("calculate_RCR(amd64g): invalid size");
   }

   /* Operand plus carry form a ring of msb+2 bits, so counts repeat
      with that period. */
   while (count >= msb + 2)
      count -= msb + 2;

   below_msb = (1ULL << msb) - 1;
   carry     = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
   ovf       = ((arg >> msb) ^ carry) & 1;

   for (; count > 0; count--) {
      dropped = arg & 1;
      arg     = ((arg >> 1) & below_msb) | (carry << msb);
      carry   = dropped;
   }

   carry &= 1;
   ovf   &= 1;
   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   rflags_in |= (carry << AMD64G_CC_SHIFT_C) | (ovf << AMD64G_CC_SHIFT_O);

   if (wantRflags)
      return rflags_in;
   return arg;
}
2712 
/* Rotate 'arg' left through the carry flag.

   arg        value to rotate; only the low |szIN| bytes take part
   rot_amt    rotate count; masked to 6 bits for 8-byte operands and
              to 5 bits otherwise
   rflags_in  incoming flags; the carry bit feeds the rotation
   szIN       operand size in bytes (1, 2, 4 or 8).  Passing the
              negated size asks for the flags instead of the value.

   Returns either the rotated value or rflags_in with the C and O
   bits replaced -- never both.  O is derived from the result's top
   bit and the outgoing carry, after the rotation.  Any other size
   panics. */
ULong amd64g_calculate_RCL ( ULong arg,
                             ULong rot_amt,
                             ULong rflags_in,
                             Long  szIN )
{
   Bool  wantRflags = toBool(szIN < 0);
   ULong sz         = wantRflags ? (-szIN) : szIN;
   ULong count      = rot_amt & (sz == 8 ? 0x3F : 0x1F);
   ULong msb        = 0;  /* bit index of the operand's top bit */
   ULong carry, ovf, operand_bits, dropped;

   switch (sz) {
      case 8:  msb = 63; break;
      case 4:  msb = 31; break;
      case 2:  msb = 15; break;
      case 1:  msb = 7;  break;
      default:
         vpanic("calculate_RCL(amd64g): invalid size");
   }

   /* Operand plus carry form a ring of msb+2 bits, so counts repeat
      with that period. */
   while (count >= msb + 2)
      count -= msb + 2;

   /* All-ones over bits 0..msb, built without shifting by 64. */
   operand_bits = (((1ULL << msb) - 1) << 1) | 1;
   carry        = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;

   for (; count > 0; count--) {
      dropped = (arg >> msb) & 1;
      arg     = operand_bits & ((arg << 1) | (carry & 1));
      carry   = dropped;
   }
   ovf = ((arg >> msb) ^ carry) & 1;

   carry &= 1;
   ovf   &= 1;
   rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
   rflags_in |= (carry << AMD64G_CC_SHIFT_C) | (ovf << AMD64G_CC_SHIFT_O);

   if (wantRflags)
      return rflags_in;
   return arg;
}
2778 
2779 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
2780  * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
2781  */
/* 64 x 64 -> 128 bit multiplication in which partial products are
   combined with XOR rather than addition.  Returns the upper 64 bits
   of the product when 'which' is non-zero, the lower 64 bits
   otherwise. */
ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
{
   /* 0x01 replicated into every byte (i.e. ~0ULL / 255). */
   const ULong byte_ones = 0x0101010101010101ULL;
   ULong tab[16];
   ULong hi, lo, sel, k, sh;

   /* tab[i] = low 64 bits of (a times the 4-bit value i), XOR-wise. */
   tab[0] = 0;
   tab[1] = a;
   for (k = 1; k < 8; k++) {
      tab[2 * k]     = tab[k] << 1;
      tab[2 * k + 1] = tab[2 * k] ^ a;
   }

   /* Consume b one byte at a time, most significant first.  Each
      byte shifted out of the top of lo is collected in hi. */
   lo = (tab[b >> 60] << 4) ^ tab[(b >> 56) & 15];
   hi = lo >> 56;
   for (sh = 48; sh >= 8; sh -= 8) {
      lo = (lo << 8) ^ (tab[(b >> (sh + 4)) & 15] << 4)
                     ^ tab[(b >> sh) & 15];
      hi = (hi << 8) | (lo >> 56);
   }
   lo = (lo << 8) ^ (tab[(b >> 4) & 15] << 4) ^ tab[b & 15];

   /* The table entries are truncated to 64 bits, so contributions of
      the top seven bits of a are missing from hi; add them back. */
   for (k = 1; k <= 7; k++) {
      sel  = -((a >> (64 - k)) & 1);                  /* 0 or all ones */
      sel &= (b & (byte_ones * ((0xFF << k) & 0xFF))) >> k;
      hi  ^= sel;
   }

   if (which)
      return hi;
   return lo;
}
2823 
2824 
2825 /* CALLED FROM GENERATED CODE */
2826 /* DIRTY HELPER (non-referentially-transparent) */
2827 /* Horrible hack.  On non-amd64 platforms, return 1. */
amd64g_dirtyhelper_RDTSC(void)2828 ULong amd64g_dirtyhelper_RDTSC ( void )
2829 {
2830 #  if defined(__x86_64__)
2831    UInt  eax, edx;
2832    __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
2833    return (((ULong)edx) << 32) | ((ULong)eax);
2834 #  else
2835    return 1ULL;
2836 #  endif
2837 }
2838 
2839 
2840 /* CALLED FROM GENERATED CODE */
2841 /* DIRTY HELPER (non-referentially-transparent) */
2842 /* Horrible hack.  On non-amd64 platforms, return 0. */
/* When built for x86_64, reads sz bytes (1, 2 or 4) from I/O port
   (portno & 0xFFFF) with the matching in{b,w,l} instruction and
   returns the value read.  Any other sz yields 0, as does every call
   in a non-x86_64 build. */
ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
{
#  if defined(__x86_64__)
   ULong r = 0;
   portno &= 0xFFFF;
   if (sz == 4) {
      __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
                           : "=a" (r) : "Nd" (portno));
   }
   else if (sz == 2) {
      __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
                           : "=a" (r) : "Nd" (portno));
   }
   else if (sz == 1) {
      __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
                           : "=a" (r) : "Nd" (portno));
   }
   /* else: no 64-bit version of the insn exists; r stays 0 */
   return r;
#  else
   return 0;
#  endif
}
2869 
2870 
2871 /* CALLED FROM GENERATED CODE */
2872 /* DIRTY HELPER (non-referentially-transparent) */
2873 /* Horrible hack.  On non-amd64 platforms, do nothing. */
/* When built for x86_64, writes the low sz bytes (1, 2 or 4) of
   'data' to I/O port (portno & 0xFFFF) with the matching out{b,w,l}
   instruction.  Any other sz is silently ignored, and in a
   non-x86_64 build the function has no effect at all. */
void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
{
#  if defined(__x86_64__)
   portno &= 0xFFFF;
   if (sz == 4) {
      __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
                           : : "a" (data), "Nd" (portno));
   }
   else if (sz == 2) {
      __asm__ __volatile__("outw %w0, %w1"
                           : : "a" (data), "Nd" (portno));
   }
   else if (sz == 1) {
      __asm__ __volatile__("outb %b0, %w1"
                           : : "a" (data), "Nd" (portno));
   }
   /* else: no 64-bit version of the insn exists */
#  else
   /* do nothing */
#  endif
}
2898 
2899 /* CALLED FROM GENERATED CODE */
2900 /* DIRTY HELPER (non-referentially-transparent) */
2901 /* Horrible hack.  On non-amd64 platforms, do nothing. */
2902 /* op = 0: call the native SGDT instruction.
2903    op = 1: call the native SIDT instruction.
2904 */
amd64g_dirtyhelper_SxDT(void * address,ULong op)2905 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
2906 #  if defined(__x86_64__)
2907    switch (op) {
2908       case 0:
2909          __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
2910          break;
2911       case 1:
2912          __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
2913          break;
2914       default:
2915          vpanic("amd64g_dirtyhelper_SxDT");
2916    }
2917 #  else
2918    /* do nothing */
2919    UChar* p = (UChar*)address;
2920    p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
2921    p[6] = p[7] = p[8] = p[9] = 0;
2922 #  endif
2923 }
2924 
2925 /*---------------------------------------------------------------*/
2926 /*--- Helpers for MMX/SSE/SSE2.                               ---*/
2927 /*---------------------------------------------------------------*/
2928 
/* Absolute difference of two unsigned bytes. */
static inline UChar abdU8 ( UChar xx, UChar yy ) {
   if (xx > yy)
      return toUChar(xx - yy);
   return toUChar(yy - xx);
}
2932 
mk32x2(UInt w1,UInt w0)2933 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
2934    return (((ULong)w1) << 32) | ((ULong)w0);
2935 }
2936 
sel16x4_3(ULong w64)2937 static inline UShort sel16x4_3 ( ULong w64 ) {
2938    UInt hi32 = toUInt(w64 >> 32);
2939    return toUShort(hi32 >> 16);
2940 }
sel16x4_2(ULong w64)2941 static inline UShort sel16x4_2 ( ULong w64 ) {
2942    UInt hi32 = toUInt(w64 >> 32);
2943    return toUShort(hi32);
2944 }
sel16x4_1(ULong w64)2945 static inline UShort sel16x4_1 ( ULong w64 ) {
2946    UInt lo32 = toUInt(w64);
2947    return toUShort(lo32 >> 16);
2948 }
sel16x4_0(ULong w64)2949 static inline UShort sel16x4_0 ( ULong w64 ) {
2950    UInt lo32 = toUInt(w64);
2951    return toUShort(lo32);
2952 }
2953 
sel8x8_7(ULong w64)2954 static inline UChar sel8x8_7 ( ULong w64 ) {
2955    UInt hi32 = toUInt(w64 >> 32);
2956    return toUChar(hi32 >> 24);
2957 }
sel8x8_6(ULong w64)2958 static inline UChar sel8x8_6 ( ULong w64 ) {
2959    UInt hi32 = toUInt(w64 >> 32);
2960    return toUChar(hi32 >> 16);
2961 }
sel8x8_5(ULong w64)2962 static inline UChar sel8x8_5 ( ULong w64 ) {
2963    UInt hi32 = toUInt(w64 >> 32);
2964    return toUChar(hi32 >> 8);
2965 }
sel8x8_4(ULong w64)2966 static inline UChar sel8x8_4 ( ULong w64 ) {
2967    UInt hi32 = toUInt(w64 >> 32);
2968    return toUChar(hi32 >> 0);
2969 }
sel8x8_3(ULong w64)2970 static inline UChar sel8x8_3 ( ULong w64 ) {
2971    UInt lo32 = toUInt(w64);
2972    return toUChar(lo32 >> 24);
2973 }
sel8x8_2(ULong w64)2974 static inline UChar sel8x8_2 ( ULong w64 ) {
2975    UInt lo32 = toUInt(w64);
2976    return toUChar(lo32 >> 16);
2977 }
sel8x8_1(ULong w64)2978 static inline UChar sel8x8_1 ( ULong w64 ) {
2979    UInt lo32 = toUInt(w64);
2980    return toUChar(lo32 >> 8);
2981 }
sel8x8_0(ULong w64)2982 static inline UChar sel8x8_0 ( ULong w64 ) {
2983    UInt lo32 = toUInt(w64);
2984    return toUChar(lo32 >> 0);
2985 }
2986 
2987 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Treats xx and yy as four signed 16-bit lanes each, multiplies them
   lane by lane and adds adjacent products: lanes 3 and 2 form the
   upper 32 bits of the result, lanes 1 and 0 the lower 32 bits. */
ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
{
   Int x3 = (Int)(Short)sel16x4_3(xx),  y3 = (Int)(Short)sel16x4_3(yy);
   Int x2 = (Int)(Short)sel16x4_2(xx),  y2 = (Int)(Short)sel16x4_2(yy);
   Int x1 = (Int)(Short)sel16x4_1(xx),  y1 = (Int)(Short)sel16x4_1(yy);
   Int x0 = (Int)(Short)sel16x4_0(xx),  y0 = (Int)(Short)sel16x4_0(yy);

   Int upper = (x3 * y3) + (x2 * y2);
   Int lower = (x1 * y1) + (x0 * y0);

   return mk32x2(upper, lower);
}
2998 
2999 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Gathers the top bit of each of the eight bytes of xx into an 8-bit
   mask: bit i of the result is bit 7 of byte i. */
ULong amd64g_calculate_mmx_pmovmskb ( ULong xx )
{
   ULong mask = 0;
   UInt  lane;
   for (lane = 0; lane < 8; lane++) {
      if ((xx >> (8 * lane + 7)) & 1)
         mask |= 1ULL << lane;
   }
   return mask;
}
3013 
3014 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Sum of the absolute differences of the eight unsigned byte pairs
   of xx and yy, kept to its low 16 bits. */
ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
{
   UInt sum
      = (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) )
        + (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) )
        + (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) )
        + (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) )
        + (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) )
        + (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) )
        + (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) )
        + (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
   return (ULong)(sum & 0xFFFF);
}
3029 
3030 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* 16-bit byte-sign mask of a 128-bit value given as two halves:
   bits 15:8 come from w64hi, bits 7:0 from w64lo. */
ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
{
   ULong mask = amd64g_calculate_mmx_pmovmskb ( w64hi ) & 0xFF;
   mask <<= 8;
   mask  |= amd64g_calculate_mmx_pmovmskb ( w64lo ) & 0xFF;
   return mask;
}
3037 
3038 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Finds the smallest of the eight unsigned 16-bit lanes of sHi:sLo
   (lanes 0..3 in sLo, 4..7 in sHi).  The result holds the minimum in
   bits 15:0 and the index of its lowest-numbered occurrence starting
   at bit 16. */
ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
{
   const UShort lane[8] = {
      sel16x4_0(sLo), sel16x4_1(sLo), sel16x4_2(sLo), sel16x4_3(sLo),
      sel16x4_0(sHi), sel16x4_1(sHi), sel16x4_2(sHi), sel16x4_3(sHi)
   };
   UShort min = lane[0];
   UInt   idx = 0;
   UInt   i;
   for (i = 1; i < 8; i++) {
      /* strict '<' keeps the first of several equal minima */
      if (lane[i] < min) {
         min = lane[i];
         idx = i;
      }
   }
   return ((ULong)(idx << 16)) | ((ULong)min);
}
3053 
3054 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Folds the low byte of b into crcIn: XOR it in, then perform eight
   shift-right steps, XORing in 0x82f63b78 whenever a 1 bit is
   shifted out. */
ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
{
   ULong crc  = crcIn ^ (b & 0xFFULL);
   UInt  left = 8;
   while (left > 0) {
      ULong out = crc & 1;
      crc >>= 1;
      if (out)
         crc ^= 0x82f63b78ULL;
      left--;
   }
   return crc;
}
3063 
3064 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Folds the low 16 bits of w into crcIn: XOR them in, then perform
   sixteen shift-right steps, XORing in 0x82f63b78 whenever a 1 bit
   is shifted out. */
ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
{
   ULong crc  = crcIn ^ (w & 0xFFFFULL);
   UInt  left = 16;
   while (left > 0) {
      ULong out = crc & 1;
      crc >>= 1;
      if (out)
         crc ^= 0x82f63b78ULL;
      left--;
   }
   return crc;
}
3073 
3074 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Folds the low 32 bits of l into crcIn: XOR them in, then perform
   thirty-two shift-right steps, XORing in 0x82f63b78 whenever a 1
   bit is shifted out. */
ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
{
   ULong crc  = crcIn ^ (l & 0xFFFFFFFFULL);
   UInt  left = 32;
   while (left > 0) {
      ULong out = crc & 1;
      crc >>= 1;
      if (out)
         crc ^= 0x82f63b78ULL;
      left--;
   }
   return crc;
}
3083 
3084 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Folds all 64 bits of q into crcIn as two 32-bit steps, lower half
   first. */
ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
{
   return amd64g_calc_crc32l(amd64g_calc_crc32l(crcIn, q), q >> 32);
}
3090 
3091 
3092 /* .. helper for next fn .. */
/* Sum of absolute differences over the four lowest bytes of xx and
   yy; the upper four bytes of each are ignored. */
static inline ULong sad_8x4 ( ULong xx, ULong yy )
{
   UInt sum
      = (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) )
        + (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) )
        + (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) )
        + (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   return (ULong)sum;
}
3102 
3103 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Computes one 64-bit half of an MPSADBW result.

   sHi:sLo and dHi:dLo are the 128-bit source and destination values.
   imm_and_return_control_bit carries the instruction's immediate in
   bits 2:0 and, in bit 7, which half of the result is wanted
   (1 = upper, 0 = lower).

   Immediate bits 1:0 choose one 32-bit group of the source; bit 2
   moves the destination window up by 4 bytes.  The returned value
   packs four 16-bit sums: sum k is the 4-byte SAD of the source
   group against the destination bytes starting k bytes into the
   window. */
ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
                            ULong dHi, ULong dLo,
                            ULong imm_and_return_control_bit )
{
   const ULong low7bytes = 0x00FFFFFFFFFFFFFFULL;
   UInt  imm8     = imm_and_return_control_bit & 7;
   UInt  wantHi   = (imm_and_return_control_bit >> 7) & 1;
   UInt  srcOffsL = imm8 & 3;         /* src offs in 32-bit chunks */
   UInt  dstOffsL = (imm8 >> 2) & 1;  /* dst offs in 32-bit chunks */
   ULong src, dst, res;
   UInt  k;

   /* Only 32 source bits matter; bring them to the bottom of src. */
   src = (srcOffsL & 2) ? sHi : sLo;
   if (srcOffsL & 1)
      src >>= 32;

   /* Seven destination bytes are needed, starting at byte
      4 * (dstOffsL + wantHi) of dHi:dLo. */
   switch (dstOffsL + wantHi) {
      case 0:  /* bytes 0 .. 6 */
         dst = dLo & low7bytes;
         break;
      case 2:  /* bytes 8 .. 14 */
         dst = dHi & low7bytes;
         break;
      default: /* bytes 4 .. 10 */
         dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
         break;
   }

   res = 0;
   for (k = 0; k < 4; k++)
      res |= sad_8x4( dst >> (8 * k), src ) << (16 * k);
   return res;
}
3144 
3145 /*---------------------------------------------------------------*/
3146 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M}                    ---*/
3147 /*---------------------------------------------------------------*/
3148 
zmask_from_V128(V128 * arg)3149 static UInt zmask_from_V128 ( V128* arg )
3150 {
3151    UInt i, res = 0;
3152    for (i = 0; i < 16; i++) {
3153       res |=  ((arg->w8[i] == 0) ? 1 : 0) << i;
3154    }
3155    return res;
3156 }
3157 
zmask_from_V128_wide(V128 * arg)3158 static UInt zmask_from_V128_wide ( V128* arg )
3159 {
3160    UInt i, res = 0;
3161    for (i = 0; i < 8; i++) {
3162       res |=  ((arg->w16[i] == 0) ? 1 : 0) << i;
3163    }
3164    return res;
3165 }
3166 
3167 /* Helps with PCMP{I,E}STR{I,M}.
3168 
3169    CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (But not really,
3170    actually it could be a clean helper, but for the fact that we can't
3171    pass by value 2 x V128 to a clean helper, nor have one returned.)
3172    Reads guest state, writes to guest state for the xSTRM cases, no
3173    accesses of memory, is a pure function.
3174 
3175    opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
3176    the callee knows which I/E and I/M variant it is dealing with and
3177    what the specific operation is.  4th byte of opcode is in the range
3178    0x60 to 0x63:
3179        istri  66 0F 3A 63
3180        istrm  66 0F 3A 62
3181        estri  66 0F 3A 61
3182        estrm  66 0F 3A 60
3183 
3184    gstOffL and gstOffR are the guest state offsets for the two XMM
3185    register inputs.  We never have to deal with the memory case since
3186    that is handled by pre-loading the relevant value into the fake
3187    XMM16 register.
3188 
3189    For ESTRx variants, edxIN and eaxIN hold the values of those two
3190    registers.
3191 
3192    In all cases, the bottom 16 bits of the result contain the new
3193    OSZACP %rflags values.  For xSTRI variants, bits[31:16] of the
3194    result hold the new %ecx value.  For xSTRM variants, the helper
3195    writes the result directly to the guest XMM0.
3196 
3197    Declarable side effects: in all cases, reads guest state at
3198    [gstOffL, +16) and [gstOffR, +16).  For xSTRM variants, also writes
3199    guest_XMM0.
3200 
3201    Is expected to be called with opc_and_imm combinations which have
3202    actually been validated, and will assert if otherwise.  The front
3203    end should ensure we're only called with verified values.
3204 */
amd64g_dirtyhelper_PCMPxSTRx(VexGuestAMD64State * gst,HWord opc4_and_imm,HWord gstOffL,HWord gstOffR,HWord edxIN,HWord eaxIN)3205 ULong amd64g_dirtyhelper_PCMPxSTRx (
3206           VexGuestAMD64State* gst,
3207           HWord opc4_and_imm,
3208           HWord gstOffL, HWord gstOffR,
3209           HWord edxIN, HWord eaxIN
3210        )
3211 {
3212    HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
3213    HWord imm8 = opc4_and_imm & 0xFF;
3214    HWord isISTRx = opc4 & 2;
3215    HWord isxSTRM = (opc4 & 1) ^ 1;
3216    vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
3217    HWord wide = (imm8 & 1);
3218 
3219    // where the args are
3220    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3221    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3222 
3223    /* Create the arg validity masks, either from the vectors
3224       themselves or from the supplied edx/eax values. */
3225    // FIXME: this is only right for the 8-bit data cases.
3226    // At least that is asserted above.
3227    UInt zmaskL, zmaskR;
3228 
3229    // temp spot for the resulting flags and vector.
3230    V128 resV;
3231    UInt resOSZACP;
3232 
3233    // for checking whether case was handled
3234    Bool ok = False;
3235 
3236    if (wide) {
3237       if (isISTRx) {
3238          zmaskL = zmask_from_V128_wide(argL);
3239          zmaskR = zmask_from_V128_wide(argR);
3240       } else {
3241          Int tmp;
3242          tmp = edxIN & 0xFFFFFFFF;
3243          if (tmp < -8) tmp = -8;
3244          if (tmp > 8)  tmp = 8;
3245          if (tmp < 0)  tmp = -tmp;
3246          vassert(tmp >= 0 && tmp <= 8);
3247          zmaskL = (1 << tmp) & 0xFF;
3248          tmp = eaxIN & 0xFFFFFFFF;
3249          if (tmp < -8) tmp = -8;
3250          if (tmp > 8)  tmp = 8;
3251          if (tmp < 0)  tmp = -tmp;
3252          vassert(tmp >= 0 && tmp <= 8);
3253          zmaskR = (1 << tmp) & 0xFF;
3254       }
3255       // do the meyaath
3256       ok = compute_PCMPxSTRx_wide (
3257               &resV, &resOSZACP, argL, argR,
3258               zmaskL, zmaskR, imm8, (Bool)isxSTRM
3259            );
3260    } else {
3261       if (isISTRx) {
3262          zmaskL = zmask_from_V128(argL);
3263          zmaskR = zmask_from_V128(argR);
3264       } else {
3265          Int tmp;
3266          tmp = edxIN & 0xFFFFFFFF;
3267          if (tmp < -16) tmp = -16;
3268          if (tmp > 16)  tmp = 16;
3269          if (tmp < 0)   tmp = -tmp;
3270          vassert(tmp >= 0 && tmp <= 16);
3271          zmaskL = (1 << tmp) & 0xFFFF;
3272          tmp = eaxIN & 0xFFFFFFFF;
3273          if (tmp < -16) tmp = -16;
3274          if (tmp > 16)  tmp = 16;
3275          if (tmp < 0)   tmp = -tmp;
3276          vassert(tmp >= 0 && tmp <= 16);
3277          zmaskR = (1 << tmp) & 0xFFFF;
3278       }
3279       // do the meyaath
3280       ok = compute_PCMPxSTRx (
3281               &resV, &resOSZACP, argL, argR,
3282               zmaskL, zmaskR, imm8, (Bool)isxSTRM
3283            );
3284    }
3285 
3286    // front end shouldn't pass us any imm8 variants we can't
3287    // handle.  Hence:
3288    vassert(ok);
3289 
3290    // So, finally we need to get the results back to the caller.
3291    // In all cases, the new OSZACP value is the lowest 16 of
3292    // the return value.
3293    if (isxSTRM) {
3294       gst->guest_YMM0[0] = resV.w32[0];
3295       gst->guest_YMM0[1] = resV.w32[1];
3296       gst->guest_YMM0[2] = resV.w32[2];
3297       gst->guest_YMM0[3] = resV.w32[3];
3298       return resOSZACP & 0x8D5;
3299    } else {
3300       UInt newECX = resV.w32[0] & 0xFFFF;
3301       return (newECX << 16) | (resOSZACP & 0x8D5);
3302    }
3303 }
3304 
3305 /*---------------------------------------------------------------*/
3306 /*--- AES primitives and helpers                              ---*/
3307 /*---------------------------------------------------------------*/
/* Byte substitution table used by SubBytes and SubWord: the
   replacement for byte value b is sbox[b].  Laid out as a 16 x 16
   matrix; each matrix row spans two source lines, and the trailing
   "row nr" comments count rows from 1. */
static const UChar sbox[256] = {                   // row nr
   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
/* Replace each of the 16 bytes of *v by its sbox entry.  Every lane
   depends only on its own old value, so the substitution is done in
   place. */
static void SubBytes (V128* v)
{
   UInt lane;
   for (lane = 0; lane < 16; lane++) {
      v->w8[lane] = sbox[ v->w8[lane] ];
   }
}
3351 
/* Byte substitution table used by InvSubBytes: the replacement for
   byte value b is invsbox[b].  Laid out as a 16 x 16 matrix; each
   matrix row spans two source lines, and the trailing "row nr"
   comments count rows from 1. */
static const UChar invsbox[256] = {                // row nr
   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
/* Replace each of the 16 bytes of *v by its invsbox entry.  Every
   lane depends only on its own old value, so the substitution is
   done in place. */
static void InvSubBytes (V128* v)
{
   UInt lane;
   for (lane = 0; lane < 16; lane++) {
      v->w8[lane] = invsbox[ v->w8[lane] ];
   }
}
3395 
3396 static const UChar ShiftRows_op[16] =
3397    {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
ShiftRows(V128 * v)3398 static void ShiftRows (V128* v)
3399 {
3400    V128 r;
3401    UInt i;
3402    for (i = 0; i < 16; i++)
3403       r.w8[i] = v->w8[ShiftRows_op[15-i]];
3404    *v = r;
3405 }
3406 
3407 static const UChar InvShiftRows_op[16] =
3408    {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
InvShiftRows(V128 * v)3409 static void InvShiftRows (V128* v)
3410 {
3411    V128 r;
3412    UInt i;
3413    for (i = 0; i < 16; i++)
3414       r.w8[i] = v->w8[InvShiftRows_op[15-i]];
3415    *v = r;
3416 }
3417 
/* Multiplication of the finite field elements of AES.
   See "A Specification for The AES Algorithm Rijndael
        (by Joan Daemen & Vincent Rijmen)"
        Dr. Brian Gladman, v3.1, 3rd March 2001. */
/* Logarithm table to base 0x03: for a byte value xy (hex), Nxy[xy]
   is the N for which xy = 0x03^N.  0x00 has no such N; its slot
   holds the placeholder 0xff.  ff_mul never looks that slot up,
   because it handles a zero operand before consulting the table. */
/* a 16 x 16 matrix; each matrix row spans two source lines */
static const UChar Nxy[256] = {                    // row nr
   0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
   0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
   0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
   0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
   0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
   0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
   0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
   0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
   0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
   0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
   0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
   0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
   0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
   0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
   0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
   0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
   0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
   0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
   0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
   0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
   0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
   0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
   0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
   0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
   0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
   0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
   0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
   0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
   0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
   0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
   0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
   0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
};
3459 
/* Exponentiation table to base 0x03: for an exponent xy (hex),
   Exy[xy] is the field element E = 0x03^xy.  ff_mul indexes this
   with a sum of two Nxy entries reduced into the range 0 .. 254.
   Laid out as a 16 x 16 matrix; each matrix row spans two source
   lines. */
static const UChar Exy[256] = {                    // row nr
   0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
   0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
   0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
   0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
   0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
   0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
   0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
   0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
   0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
   0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
   0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
   0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
   0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
   0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
   0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
   0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
   0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
   0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
   0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
   0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
   0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
   0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
   0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
   0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
   0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
   0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
   0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
   0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
   0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
   0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
   0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
   0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
3494 
/* Multiply two field elements using the Nxy/Exy tables: look up both
   operands in Nxy, add the two entries, reduce the sum by 255 if it
   reached 255, and look the result up in Exy.  A zero operand has no
   usable Nxy entry, so that case returns 0 directly. */
static inline UChar ff_mul(UChar u1, UChar u2)
{
   UInt expo;

   if (u1 == 0 || u2 == 0)
      return 0;

   expo = Nxy[u1] + Nxy[u2];
   if (expo >= 255)
      expo -= 255;
   return Exy[expo];
}
3506 
/* Transform each group of four consecutive bytes of *v
   (w8[4j .. 4j+3], j = 0 .. 3) by the matrix

      02 03 01 01
      01 02 03 01
      01 01 02 03
      03 01 01 02

   with products formed by ff_mul and sums by XOR.  The four input
   bytes of a group are copied out first, so the group can then be
   overwritten in place. */
static void MixColumns (V128* v)
{
   UInt j;
   for (j = 0; j < 4; j++) {
      UInt  base = 4 * j;
      UChar a0 = v->w8[base + 0];
      UChar a1 = v->w8[base + 1];
      UChar a2 = v->w8[base + 2];
      UChar a3 = v->w8[base + 3];

      v->w8[base + 0] = ff_mul(0x02, a0) ^ ff_mul(0x03, a1) ^ a2 ^ a3;
      v->w8[base + 1] = a0 ^ ff_mul(0x02, a1) ^ ff_mul(0x03, a2) ^ a3;
      v->w8[base + 2] = a0 ^ a1 ^ ff_mul(0x02, a2) ^ ff_mul(0x03, a3);
      v->w8[base + 3] = ff_mul(0x03, a0) ^ a1 ^ a2 ^ ff_mul(0x02, a3);
   }
}
3525 
/* Transform each group of four consecutive bytes of *v
   (w8[4j .. 4j+3], j = 0 .. 3) by the matrix

      0e 0b 0d 09
      09 0e 0b 0d
      0d 09 0e 0b
      0b 0d 09 0e

   with products formed by ff_mul and sums by XOR.  The four input
   bytes of a group are copied out first, so the group can then be
   overwritten in place. */
static void InvMixColumns (V128* v)
{
   UInt j;
   for (j = 0; j < 4; j++) {
      UInt  base = 4 * j;
      UChar a0 = v->w8[base + 0];
      UChar a1 = v->w8[base + 1];
      UChar a2 = v->w8[base + 2];
      UChar a3 = v->w8[base + 3];

      v->w8[base + 0] = ff_mul(0x0e, a0) ^ ff_mul(0x0b, a1)
                        ^ ff_mul(0x0d, a2) ^ ff_mul(0x09, a3);
      v->w8[base + 1] = ff_mul(0x09, a0) ^ ff_mul(0x0e, a1)
                        ^ ff_mul(0x0b, a2) ^ ff_mul(0x0d, a3);
      v->w8[base + 2] = ff_mul(0x0d, a0) ^ ff_mul(0x09, a1)
                        ^ ff_mul(0x0e, a2) ^ ff_mul(0x0b, a3);
      v->w8[base + 3] = ff_mul(0x0b, a0) ^ ff_mul(0x0d, a1)
                        ^ ff_mul(0x09, a2) ^ ff_mul(0x0e, a3);
   }
}
3545 
3546 /* For description, see definition in guest_amd64_defs.h */
amd64g_dirtyhelper_AES(VexGuestAMD64State * gst,HWord opc4,HWord gstOffD,HWord gstOffL,HWord gstOffR)3547 void amd64g_dirtyhelper_AES (
3548           VexGuestAMD64State* gst,
3549           HWord opc4, HWord gstOffD,
3550           HWord gstOffL, HWord gstOffR
3551        )
3552 {
3553    // where the args are
3554    V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
3555    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3556    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3557    V128  r;
3558 
3559    switch (opc4) {
3560       case 0xDC: /* AESENC */
3561       case 0xDD: /* AESENCLAST */
3562          r = *argR;
3563          ShiftRows (&r);
3564          SubBytes  (&r);
3565          if (opc4 == 0xDC)
3566             MixColumns (&r);
3567          argD->w64[0] = r.w64[0] ^ argL->w64[0];
3568          argD->w64[1] = r.w64[1] ^ argL->w64[1];
3569          break;
3570 
3571       case 0xDE: /* AESDEC */
3572       case 0xDF: /* AESDECLAST */
3573          r = *argR;
3574          InvShiftRows (&r);
3575          InvSubBytes (&r);
3576          if (opc4 == 0xDE)
3577             InvMixColumns (&r);
3578          argD->w64[0] = r.w64[0] ^ argL->w64[0];
3579          argD->w64[1] = r.w64[1] ^ argL->w64[1];
3580          break;
3581 
3582       case 0xDB: /* AESIMC */
3583          *argD = *argL;
3584          InvMixColumns (argD);
3585          break;
3586       default: vassert(0);
3587    }
3588 }
3589 
/* Rotate a 32-bit word right by 8 bits: the lowest byte wraps round
   to become the highest. */
static inline UInt RotWord (UInt   w32)
{
   UInt wrapped = w32 << 24;  /* old low byte, now in bits [31:24] */
   UInt shifted = w32 >> 8;   /* the other three bytes, moved down */
   return shifted | wrapped;
}
3594 
/* Apply sbox to each of the four bytes of w32 independently; every
   substituted byte stays in the position its input byte occupied. */
static inline UInt SubWord (UInt   w32)
{
   UInt b0 = sbox[ w32         & 0xFF];
   UInt b1 = sbox[(w32 >>  8) & 0xFF];
   UInt b2 = sbox[(w32 >> 16) & 0xFF];
   UInt b3 = sbox[(w32 >> 24) & 0xFF];
   return (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
}
3608 
3609 /* For description, see definition in guest_amd64_defs.h */
amd64g_dirtyhelper_AESKEYGENASSIST(VexGuestAMD64State * gst,HWord imm8,HWord gstOffL,HWord gstOffR)3610 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
3611           VexGuestAMD64State* gst,
3612           HWord imm8,
3613           HWord gstOffL, HWord gstOffR
3614        )
3615 {
3616    // where the args are
3617    V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
3618    V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
3619 
3620    argR->w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
3621    argR->w32[2] = SubWord (argL->w32[3]);
3622    argR->w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
3623    argR->w32[0] = SubWord (argL->w32[1]);
3624 }
3625 
3626 
3627 
3628 /*---------------------------------------------------------------*/
3629 /*--- Helpers for dealing with, and describing,               ---*/
3630 /*--- guest state as a whole.                                 ---*/
3631 /*---------------------------------------------------------------*/
3632 
3633 /* Initialise the entire amd64 guest state. */
3634 /* VISIBLE TO LIBVEX CLIENT */
LibVEX_GuestAMD64_initialise(VexGuestAMD64State * vex_state)3635 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
3636 {
3637    vex_state->host_EvC_FAILADDR = 0;
3638    vex_state->host_EvC_COUNTER = 0;
3639    vex_state->pad0 = 0;
3640 
3641    vex_state->guest_RAX = 0;
3642    vex_state->guest_RCX = 0;
3643    vex_state->guest_RDX = 0;
3644    vex_state->guest_RBX = 0;
3645    vex_state->guest_RSP = 0;
3646    vex_state->guest_RBP = 0;
3647    vex_state->guest_RSI = 0;
3648    vex_state->guest_RDI = 0;
3649    vex_state->guest_R8  = 0;
3650    vex_state->guest_R9  = 0;
3651    vex_state->guest_R10 = 0;
3652    vex_state->guest_R11 = 0;
3653    vex_state->guest_R12 = 0;
3654    vex_state->guest_R13 = 0;
3655    vex_state->guest_R14 = 0;
3656    vex_state->guest_R15 = 0;
3657 
3658    vex_state->guest_CC_OP   = AMD64G_CC_OP_COPY;
3659    vex_state->guest_CC_DEP1 = 0;
3660    vex_state->guest_CC_DEP2 = 0;
3661    vex_state->guest_CC_NDEP = 0;
3662 
3663    vex_state->guest_DFLAG   = 1; /* forwards */
3664    vex_state->guest_IDFLAG  = 0;
3665 
3666    /* HACK: represent the offset associated with %fs==0. This
3667       assumes that %fs is only ever zero. */
3668    vex_state->guest_FS_ZERO = 0;
3669 
3670    vex_state->guest_RIP = 0;
3671 
3672    /* Initialise the simulated FPU */
3673    amd64g_dirtyhelper_FINIT( vex_state );
3674 
3675    /* Initialise the AVX state. */
3676 #  define AVXZERO(_ymm) \
3677       do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
3678            _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
3679       } while (0)
3680    vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
3681    AVXZERO(vex_state->guest_YMM0);
3682    AVXZERO(vex_state->guest_YMM1);
3683    AVXZERO(vex_state->guest_YMM2);
3684    AVXZERO(vex_state->guest_YMM3);
3685    AVXZERO(vex_state->guest_YMM4);
3686    AVXZERO(vex_state->guest_YMM5);
3687    AVXZERO(vex_state->guest_YMM6);
3688    AVXZERO(vex_state->guest_YMM7);
3689    AVXZERO(vex_state->guest_YMM8);
3690    AVXZERO(vex_state->guest_YMM9);
3691    AVXZERO(vex_state->guest_YMM10);
3692    AVXZERO(vex_state->guest_YMM11);
3693    AVXZERO(vex_state->guest_YMM12);
3694    AVXZERO(vex_state->guest_YMM13);
3695    AVXZERO(vex_state->guest_YMM14);
3696    AVXZERO(vex_state->guest_YMM15);
3697    AVXZERO(vex_state->guest_YMM16);
3698 
3699 #  undef AVXZERO
3700 
3701    vex_state->guest_EMWARN = EmWarn_NONE;
3702 
3703    /* These should not ever be either read or written, but we
3704       initialise them anyway. */
3705    vex_state->guest_TISTART = 0;
3706    vex_state->guest_TILEN   = 0;
3707 
3708    vex_state->guest_NRADDR   = 0;
3709    vex_state->guest_SC_CLASS = 0;
3710    vex_state->guest_GS_0x60  = 0;
3711 
3712    vex_state->guest_IP_AT_SYSCALL = 0;
3713    vex_state->pad1 = 0;
3714 }
3715 
3716 
3717 /* Figure out if any part of the guest state contained in minoff
3718    .. maxoff requires precise memory exceptions.  If in doubt return
3719    True (but this is generates significantly slower code).
3720 
3721    By default we enforce precise exns for guest %RSP, %RBP and %RIP
3722    only.  These are the minimum needed to extract correct stack
3723    backtraces from amd64 code.
3724 */
guest_amd64_state_requires_precise_mem_exns(Int minoff,Int maxoff)3725 Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
3726                                                    Int maxoff)
3727 {
3728    Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
3729    Int rbp_max = rbp_min + 8 - 1;
3730    Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
3731    Int rsp_max = rsp_min + 8 - 1;
3732    Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
3733    Int rip_max = rip_min + 8 - 1;
3734 
3735    if (maxoff < rbp_min || minoff > rbp_max) {
3736       /* no overlap with rbp */
3737    } else {
3738       return True;
3739    }
3740 
3741    if (maxoff < rsp_min || minoff > rsp_max) {
3742       /* no overlap with rsp */
3743    } else {
3744       return True;
3745    }
3746 
3747    if (maxoff < rip_min || minoff > rip_max) {
3748       /* no overlap with eip */
3749    } else {
3750       return True;
3751    }
3752 
3753    return False;
3754 }
3755 
3756 
/* Builds one .alwaysDefd entry for a VexGuestAMD64State field: its
   offset within the struct followed by its size in bytes.  The size
   is obtained via sizeof applied to a member of a null pointer;
   sizeof does not evaluate its operand, so nothing is dereferenced. */
#define ALWAYSDEFD(field)                             \
    { offsetof(VexGuestAMD64State, field),            \
      (sizeof ((VexGuestAMD64State*)0)->field) }

/* Description of the amd64 guest state: its total size, where the
   stack, frame and instruction pointers live within it, and which
   parts Memcheck should treat as always defined. */
VexGuestLayout
   amd64guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestAMD64State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
          .sizeof_SP = 8,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
          .sizeof_FP = 8,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
          .sizeof_IP = 8,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'.  This count must match the number of
             live (not commented-out) entries in .alwaysDefd below. */
          .n_alwaysDefd = 16,

          /* flags thunk: OP and NDEP are always defd, whereas DEP1
             and DEP2 have to be tracked.  See detailed comment in
             gdefs.h on meaning of thunk fields.
             NOTE(review): "gdefs.h" is presumably what is now
             guest_amd64_defs.h -- confirm. */
          .alwaysDefd
             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                 /*  4 */ ALWAYSDEFD(guest_RIP),
                 /*  5 */ ALWAYSDEFD(guest_FS_ZERO),
                 /*  6 */ ALWAYSDEFD(guest_FTOP),
                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
                 /*  9 */ ALWAYSDEFD(guest_FC3210),
                 // /* */ ALWAYSDEFD(guest_CS),
                 // /* */ ALWAYSDEFD(guest_DS),
                 // /* */ ALWAYSDEFD(guest_ES),
                 // /* */ ALWAYSDEFD(guest_FS),
                 // /* */ ALWAYSDEFD(guest_GS),
                 // /* */ ALWAYSDEFD(guest_SS),
                 // /* */ ALWAYSDEFD(guest_LDT),
                 // /* */ ALWAYSDEFD(guest_GDT),
                 /* 10 */ ALWAYSDEFD(guest_EMWARN),
                 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
                 /* 12 */ ALWAYSDEFD(guest_TISTART),
                 /* 13 */ ALWAYSDEFD(guest_TILEN),
                 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
                 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
               }
        };
3813 
3814 
3815 /*---------------------------------------------------------------*/
3816 /*--- end                               guest_amd64_helpers.c ---*/
3817 /*---------------------------------------------------------------*/
3818