1 
2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations.         ---*/
4 /*---                                               mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
6 
7 /*
8    This file is part of MemCheck, a heavyweight Valgrind tool for
9    detecting memory errors.
10 
11    Copyright (C) 2000-2017 Julian Seward
12       jseward@acm.org
13 
14    This program is free software; you can redistribute it and/or
15    modify it under the terms of the GNU General Public License as
16    published by the Free Software Foundation; either version 2 of the
17    License, or (at your option) any later version.
18 
19    This program is distributed in the hope that it will be useful, but
20    WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22    General Public License for more details.
23 
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, write to the Free Software
26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27    02111-1307, USA.
28 
29    The GNU General Public License is contained in the file COPYING.
30 */
31 
32 #include "pub_tool_basics.h"
33 #include "pub_tool_poolalloc.h"     // For mc_include.h
34 #include "pub_tool_hashtable.h"     // For mc_include.h
35 #include "pub_tool_libcassert.h"
36 #include "pub_tool_libcprint.h"
37 #include "pub_tool_tooliface.h"
38 #include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
39 #include "pub_tool_xarray.h"
40 #include "pub_tool_mallocfree.h"
41 #include "pub_tool_libcbase.h"
42 
43 #include "mc_include.h"
44 
45 
46 /* FIXMEs JRS 2011-June-16.
47 
48    Check the interpretation for vector narrowing and widening ops,
49    particularly the saturating ones.  I suspect they are overly
50    pessimistic and/or wrong.
51 
52    Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
53    saturating shifts): the interpretation is overly pessimistic.
54    See comments on the relevant cases below for details.
55 
56    Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
57    both rounding and non-rounding variants): ditto
58 */
59 
60 /* This file implements the Memcheck instrumentation, and in
61    particular contains the core of its undefined value detection
62    machinery.  For a comprehensive background of the terminology,
63    algorithms and rationale used herein, read:
64 
65      Using Valgrind to detect undefined value errors with
66      bit-precision
67 
68      Julian Seward and Nicholas Nethercote
69 
70      2005 USENIX Annual Technical Conference (General Track),
71      Anaheim, CA, USA, April 10-15, 2005.
72 
73    ----
74 
75    Here is as good a place as any to record exactly when V bits are and
76    should be checked, why, and what function is responsible.
77 
78 
79    Memcheck complains when an undefined value is used:
80 
81    1. In the condition of a conditional branch.  Because it could cause
82       incorrect control flow, and thus cause incorrect externally-visible
83       behaviour.  [mc_translate.c:complainIfUndefined]
84 
85    2. As an argument to a system call, or as the value that specifies
86       the system call number.  Because it could cause an incorrect
87       externally-visible side effect.  [mc_translate.c:mc_pre_reg_read]
88 
89    3. As the address in a load or store.  Because it could cause an
90       incorrect value to be used later, which could cause externally-visible
91       behaviour (eg. via incorrect control flow or an incorrect system call
92       argument)  [complainIfUndefined]
93 
94    4. As the target address of a branch.  Because it could cause incorrect
95       control flow.  [complainIfUndefined]
96 
97    5. As an argument to setenv, unsetenv, or putenv.  Because it could put
98       an incorrect value into the external environment.
99       [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
100 
101    6. As the index in a GETI or PUTI operation.  I'm not sure why... (njn).
102       [complainIfUndefined]
103 
104    7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
105       VALGRIND_CHECK_VALUE_IS_DEFINED client requests.  Because the user
106       requested it.  [in memcheck.h]
107 
108 
109    Memcheck also complains, but should not, when an undefined value is used:
110 
111    8. As the shift value in certain SIMD shift operations (but not in the
112       standard integer shift operations).  This inconsistency is due to
113       historical reasons.  [complainIfUndefined]
114 
115 
116    Memcheck does not complain, but should, when an undefined value is used:
117 
118    9. As an input to a client request.  Because the client request may
119       affect the visible behaviour -- see bug #144362 for an example
120       involving the malloc replacements in vg_replace_malloc.c and
121       VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
122       isn't identified.  That bug report also has some info on how to solve
123       the problem.  [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 
125 
126    In practice, 1 and 2 account for the vast majority of cases.
127 */
128 
129 /* Generation of addr-definedness, addr-validity and
130    guard-definedness checks pertaining to loads and stores (Iex_Load,
131    Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
132    loads/stores) was re-checked 11 May 2013. */
133 
134 /*------------------------------------------------------------*/
135 /*--- Forward decls                                        ---*/
136 /*------------------------------------------------------------*/
137 
138 struct _MCEnv;
139 
140 static IRType  shadowTypeV ( IRType ty );
141 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e );
142 static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
143 
144 static IRExpr *i128_const_zero(void);
145 
146 /*------------------------------------------------------------*/
147 /*--- Memcheck running state, and tmp management.          ---*/
148 /*------------------------------------------------------------*/
149 
150 /* Carries info about a particular tmp.  The tmp's number is not
151    recorded, as this is implied by (equal to) its index in the tmpMap
152    in MCEnv.  The tmp's type is also not recorded, as this is present
153    in MCEnv.sb->tyenv.
154 
155    When .kind is Orig, .shadowV and .shadowB may give the identities
156    of the temps currently holding the associated definedness (shadowV)
157    and origin (shadowB) values, or these may be IRTemp_INVALID if code
158    to compute such values has not yet been emitted.
159 
160    When .kind is VSh or BSh then the tmp holds a V- or B- value,
161    and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
162    illogical for a shadow tmp itself to be shadowed.
163 */
164 typedef
165    enum { Orig=1, VSh=2, BSh=3 }
166    TempKind;
167 
168 typedef
169    struct {
170       TempKind kind;
171       IRTemp   shadowV;
172       IRTemp   shadowB;
173    }
174    TempMapEnt;
175 
176 
177 /* Carries around state during memcheck instrumentation. */
178 typedef
179    struct _MCEnv {
180       /* MODIFIED: the superblock being constructed.  IRStmts are
181          added. */
182       IRSB* sb;
183       Bool  trace;
184 
185       /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
186          current kind and possibly shadow temps for each temp in the
187          IRSB being constructed.  Note that it does not contain the
188          type of each tmp.  If you want to know the type, look at the
189          relevant entry in sb->tyenv.  It follows that at all times
190          during the instrumentation process, the valid indices for
191          tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
192          total number of Orig, V- and B- temps allocated so far.
193 
194          The reason for this strange split (types in one place, all
195          other info in another) is that we need the types to be
196          attached to sb so as to make it possible to do
197          "typeOfIRExpr(mce->sb->tyenv, ...)" at various places in the
198          instrumentation process. */
199       XArray* /* of TempMapEnt */ tmpMap;
200 
201       /* MODIFIED: indicates whether "bogus" literals have so far been
202          found.  Starts off False, and may change to True. */
203       Bool bogusLiterals;
204 
205       /* READONLY: indicates whether we should use expensive
206          interpretations of integer adds, since unfortunately LLVM
207          uses them to do ORs in some circumstances.  Defaulted to True
208          on MacOS and False everywhere else. */
209       Bool useLLVMworkarounds;
210 
211       /* READONLY: the guest layout.  This indicates which parts of
212          the guest state should be regarded as 'always defined'. */
213       const VexGuestLayout* layout;
214 
215       /* READONLY: the host word type.  Needed for constructing
216          arguments of type 'HWord' to be passed to helper functions.
217          Ity_I32 or Ity_I64 only. */
218       IRType hWordTy;
219    }
220    MCEnv;
221 
222 /* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
223    demand), as they are encountered.  This is for two reasons.
224 
225    (1) (less important reason): Many original tmps are unused due to
226    initial IR optimisation, and we do not want to waste space in tables
227    tracking them.
228 
229    Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
230    table indexed [0 .. n_types-1], which gives the current shadow for
231    each original tmp, or INVALID_IRTEMP if none is so far assigned.
232    It is necessary to support making multiple assignments to a shadow
233    -- specifically, after testing a shadow for definedness, it needs
234    to be made defined.  But IR's SSA property disallows this.
235 
236    (2) (more important reason): Therefore, when a shadow needs to get
237    a new value, a new temporary is created, the value is assigned to
238    that, and the tmpMap is updated to reflect the new binding.
239 
240    A corollary is that if the tmpMap maps a given tmp to
241    IRTemp_INVALID and we are hoping to read that shadow tmp, it means
242    there's a read-before-write error in the original tmps.  The IR
243    sanity checker should catch all such anomalies, however.
244 */
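/* For example (illustrative only, with invented temp numbers): if the
   original code tests t5 for definedness, its current shadow t5# cannot
   simply be overwritten with an "all defined" value, since the IR is in
   SSA form.  Instead a fresh shadow temp is allocated, assigned the
   all-defined value, and tmpMap is updated so that t5 now maps to the
   new shadow; subsequent reads of t5's shadow pick up the new temp. */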
245 
246 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
247    both the table in mce->sb and to our auxiliary mapping.  Note that
248    newTemp may cause mce->tmpMap to resize, hence previous results
249    from VG_(indexXA)(mce->tmpMap) are invalidated. */
250 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
251 {
252    Word       newIx;
253    TempMapEnt ent;
254    IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
255    ent.kind    = kind;
256    ent.shadowV = IRTemp_INVALID;
257    ent.shadowB = IRTemp_INVALID;
258    newIx = VG_(addToXA)( mce->tmpMap, &ent );
259    tl_assert(newIx == (Word)tmp);
260    return tmp;
261 }
262 
263 
264 /* Find the tmp currently shadowing the given original tmp.  If none
265    so far exists, allocate one.  */
266 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
267 {
268    TempMapEnt* ent;
269    /* VG_(indexXA) range-checks 'orig', hence no need to check
270       here. */
271    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
272    tl_assert(ent->kind == Orig);
273    if (ent->shadowV == IRTemp_INVALID) {
274       IRTemp tmpV
275         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
276       /* newTemp may cause mce->tmpMap to resize, hence previous results
277          from VG_(indexXA) are invalid. */
278       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
279       tl_assert(ent->kind == Orig);
280       tl_assert(ent->shadowV == IRTemp_INVALID);
281       ent->shadowV = tmpV;
282    }
283    return ent->shadowV;
284 }
285 
286 /* Allocate a new shadow for the given original tmp.  This means any
287    previous shadow is abandoned.  This is needed because it is
288    necessary to give a new value to a shadow once it has been tested
289    for undefinedness, but unfortunately IR's SSA property disallows
290    this.  Instead we must abandon the old shadow, allocate a new one
291    and use that instead.
292 
293    This is the same as findShadowTmpV, except we don't bother to see
294    if a shadow temp already existed -- we simply allocate a new one
295    regardless. */
296 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
297 {
298    TempMapEnt* ent;
299    /* VG_(indexXA) range-checks 'orig', hence no need to check
300       here. */
301    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
302    tl_assert(ent->kind == Orig);
303    if (1) {
304       IRTemp tmpV
305         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
306       /* newTemp may cause mce->tmpMap to resize, hence previous results
307          from VG_(indexXA) are invalid. */
308       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
309       tl_assert(ent->kind == Orig);
310       ent->shadowV = tmpV;
311    }
312 }
313 
314 
315 /*------------------------------------------------------------*/
316 /*--- IRAtoms -- a subset of IRExprs                       ---*/
317 /*------------------------------------------------------------*/
318 
319 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
320    isIRAtom() in libvex_ir.h.  Because this instrumenter expects flat
321    input, most of this code deals in atoms.  Usefully, a value atom
322    always has a V-value which is also an atom: constants are shadowed
323    by constants, and temps are shadowed by the corresponding shadow
324    temporary. */
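/* For example, the constant 0x2A:I32 is shadowed by the constant
   0x0:I32 (all bits defined), while a read of tmp t7 is shadowed by a
   read of whichever shadow tmp tmpMap currently records for t7. */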
325 
326 typedef  IRExpr  IRAtom;
327 
328 /* (used for sanity checks only): is this an atom which looks
329    like it's from original code? */
330 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
331 {
332    if (a1->tag == Iex_Const)
333       return True;
334    if (a1->tag == Iex_RdTmp) {
335       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
336       return ent->kind == Orig;
337    }
338    return False;
339 }
340 
341 /* (used for sanity checks only): is this an atom which looks
342    like it's from shadow code? */
343 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
344 {
345    if (a1->tag == Iex_Const)
346       return True;
347    if (a1->tag == Iex_RdTmp) {
348       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
349       return ent->kind == VSh || ent->kind == BSh;
350    }
351    return False;
352 }
353 
354 /* (used for sanity checks only): check that both args are atoms and
355    are identically-kinded. */
356 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
357 {
358    if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
359       return True;
360    if (a1->tag == Iex_Const && a2->tag == Iex_Const)
361       return True;
362    return False;
363 }
364 
365 
366 /*------------------------------------------------------------*/
367 /*--- Type management                                      ---*/
368 /*------------------------------------------------------------*/
369 
370 /* Shadow state is always accessed using integer types.  This returns
371    an integer type with the same size (as per sizeofIRType) as the
372    given type.  The only valid shadow types are Bit, I8, I16, I32,
373    I64, I128, V128, V256. */
374 
375 static IRType shadowTypeV ( IRType ty )
376 {
377    switch (ty) {
378       case Ity_I1:
379       case Ity_I8:
380       case Ity_I16:
381       case Ity_I32:
382       case Ity_I64:
383       case Ity_I128: return ty;
384       case Ity_F16:  return Ity_I16;
385       case Ity_F32:  return Ity_I32;
386       case Ity_D32:  return Ity_I32;
387       case Ity_F64:  return Ity_I64;
388       case Ity_D64:  return Ity_I64;
389       case Ity_F128: return Ity_I128;
390       case Ity_D128: return Ity_I128;
391       case Ity_V128: return Ity_V128;
392       case Ity_V256: return Ity_V256;
393       default: ppIRType(ty);
394                VG_(tool_panic)("memcheck:shadowTypeV");
395    }
396 }
397 
398 /* Produce a 'defined' value of the given shadow type.  Should only be
399    supplied shadow types (I1/I8/I16/I32/I64/I128/V128/V256). */
400 static IRExpr* definedOfType ( IRType ty ) {
401    switch (ty) {
402       case Ity_I1:   return IRExpr_Const(IRConst_U1(False));
403       case Ity_I8:   return IRExpr_Const(IRConst_U8(0));
404       case Ity_I16:  return IRExpr_Const(IRConst_U16(0));
405       case Ity_I32:  return IRExpr_Const(IRConst_U32(0));
406       case Ity_I64:  return IRExpr_Const(IRConst_U64(0));
407       case Ity_I128: return i128_const_zero();
408       case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
409       case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
410       default:       VG_(tool_panic)("memcheck:definedOfType");
411    }
412 }
413 
414 
415 /*------------------------------------------------------------*/
416 /*--- Constructing IR fragments                            ---*/
417 /*------------------------------------------------------------*/
418 
419 /* add stmt to a bb */
420 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
421    if (mce->trace) {
422       VG_(printf)("  %c: ", cat);
423       ppIRStmt(st);
424       VG_(printf)("\n");
425    }
426    addStmtToIRSB(mce->sb, st);
427 }
428 
429 /* assign value to tmp */
430 static inline
431 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
432    stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
433 }
434 
435 /* build various kinds of expressions */
436 #define triop(_op, _arg1, _arg2, _arg3) \
437                                  IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
438 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
439 #define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
440 #define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
441 #define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
442 #define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
443 #define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
444 #define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
445 #define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
446 #define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
447 
448 /* Bind the given expression to a new temporary, and return the
449    temporary.  This effectively converts an arbitrary expression into
450    an atom.
451 
452    'ty' is the type of 'e' and hence the type that the new temporary
453    needs to be.  But passing it in is redundant, since we can deduce
454    the type merely by inspecting 'e'.  So at least use that fact to
455    assert that the two types agree. */
456 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
457 {
458    TempKind k;
459    IRTemp   t;
460    IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);
461 
462    tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
463    switch (cat) {
464       case 'V': k = VSh;  break;
465       case 'B': k = BSh;  break;
466       case 'C': k = Orig; break;
467                 /* happens when we are making up new "orig"
468                    expressions, for IRCAS handling */
469       default: tl_assert(0);
470    }
471    t = newTemp(mce, ty, k);
472    assign(cat, mce, t, e);
473    return mkexpr(t);
474 }
475 
476 
477 /*------------------------------------------------------------*/
478 /*--- Helper functions for 128-bit ops                     ---*/
479 /*------------------------------------------------------------*/
480 
481 static IRExpr *i128_const_zero(void)
482 {
483    IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
484    return binop(Iop_64HLto128, z64, z64);
485 }
486 
487 /* There are no I128-bit loads and/or stores [as generated by any
488    current front ends].  So we do not need to worry about that in
489    expr2vbits_Load */
490 
491 
492 /*------------------------------------------------------------*/
493 /*--- Constructing definedness primitive ops               ---*/
494 /*------------------------------------------------------------*/
495 
496 /* --------- Defined-if-either-defined --------- */
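/* The result V bit is defined (0) iff either argument's V bit is
   defined; hence DifD is just the bitwise AND of the two V-bit
   words. */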
497 
498 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
499    tl_assert(isShadowAtom(mce,a1));
500    tl_assert(isShadowAtom(mce,a2));
501    return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
502 }
503 
504 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
505    tl_assert(isShadowAtom(mce,a1));
506    tl_assert(isShadowAtom(mce,a2));
507    return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
508 }
509 
510 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
511    tl_assert(isShadowAtom(mce,a1));
512    tl_assert(isShadowAtom(mce,a2));
513    return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
514 }
515 
516 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
517    tl_assert(isShadowAtom(mce,a1));
518    tl_assert(isShadowAtom(mce,a2));
519    return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
520 }
521 
522 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
523    tl_assert(isShadowAtom(mce,a1));
524    tl_assert(isShadowAtom(mce,a2));
525    return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
526 }
527 
528 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
529    tl_assert(isShadowAtom(mce,a1));
530    tl_assert(isShadowAtom(mce,a2));
531    return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
532 }
533 
534 /* --------- Undefined-if-either-undefined --------- */
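/* The result V bit is undefined (1) iff either argument's V bit is
   undefined; hence UifU is just the bitwise OR of the two V-bit
   words. */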
535 
536 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
537    tl_assert(isShadowAtom(mce,a1));
538    tl_assert(isShadowAtom(mce,a2));
539    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
540 }
541 
542 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
543    tl_assert(isShadowAtom(mce,a1));
544    tl_assert(isShadowAtom(mce,a2));
545    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
546 }
547 
548 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
549    tl_assert(isShadowAtom(mce,a1));
550    tl_assert(isShadowAtom(mce,a2));
551    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
552 }
553 
554 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
555    tl_assert(isShadowAtom(mce,a1));
556    tl_assert(isShadowAtom(mce,a2));
557    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
558 }
559 
560 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
561    IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
562    tl_assert(isShadowAtom(mce,a1));
563    tl_assert(isShadowAtom(mce,a2));
564    tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
565    tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
566    tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
567    tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
568    tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
569    tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
570 
571    return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
572 }
573 
574 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
575    tl_assert(isShadowAtom(mce,a1));
576    tl_assert(isShadowAtom(mce,a2));
577    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
578 }
579 
580 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
581    tl_assert(isShadowAtom(mce,a1));
582    tl_assert(isShadowAtom(mce,a2));
583    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
584 }
585 
586 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
587    switch (vty) {
588       case Ity_I8:   return mkUifU8(mce, a1, a2);
589       case Ity_I16:  return mkUifU16(mce, a1, a2);
590       case Ity_I32:  return mkUifU32(mce, a1, a2);
591       case Ity_I64:  return mkUifU64(mce, a1, a2);
592       case Ity_I128: return mkUifU128(mce, a1, a2);
593       case Ity_V128: return mkUifUV128(mce, a1, a2);
594       case Ity_V256: return mkUifUV256(mce, a1, a2);
595       default:
596          VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
597          VG_(tool_panic)("memcheck:mkUifU");
598    }
599 }
600 
601 /* --------- The Left-family of operations. --------- */
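/* Left(x) = x | -x: every bit at or above the lowest 1 bit of x
   becomes 1.  Applied to V bits this smears undefinedness leftwards
   from the least significant undefined bit, which (roughly) models how
   an undefined bit can corrupt the carry chain of an add or subtract;
   it is the cheap interpretation used for Add/Sub. */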
602 
603 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
604    tl_assert(isShadowAtom(mce,a1));
605    return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
606 }
607 
608 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
609    tl_assert(isShadowAtom(mce,a1));
610    return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
611 }
612 
613 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
614    tl_assert(isShadowAtom(mce,a1));
615    return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
616 }
617 
618 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
619    tl_assert(isShadowAtom(mce,a1));
620    return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
621 }
622 
623 /* --------- 'Improvement' functions for AND/OR. --------- */
624 
625 /* ImproveAND(data, vbits) = data OR vbits.  Defined (0) data 0s give
626    defined (0); all other -> undefined (1).
627 */
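/* Worked single-bit example (made-up values), where d is a data bit
   and vd its V bit (0 = defined):
      d=0, vd=0  ->  d|vd = 0 : defined 0 forces a defined AND result
      d=1, vd=0  ->  d|vd = 1 : defined 1 gives no improvement here
      d=?, vd=1  ->  d|vd = 1 : undefined data gives no improvement
   The improvement terms from both operands are DifD'd onto the naive
   UifU result when And is instrumented. */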
628 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
629 {
630    tl_assert(isOriginalAtom(mce, data));
631    tl_assert(isShadowAtom(mce, vbits));
632    tl_assert(sameKindedAtoms(data, vbits));
633    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
634 }
635 
636 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
637 {
638    tl_assert(isOriginalAtom(mce, data));
639    tl_assert(isShadowAtom(mce, vbits));
640    tl_assert(sameKindedAtoms(data, vbits));
641    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
642 }
643 
644 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
645 {
646    tl_assert(isOriginalAtom(mce, data));
647    tl_assert(isShadowAtom(mce, vbits));
648    tl_assert(sameKindedAtoms(data, vbits));
649    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
650 }
651 
652 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
653 {
654    tl_assert(isOriginalAtom(mce, data));
655    tl_assert(isShadowAtom(mce, vbits));
656    tl_assert(sameKindedAtoms(data, vbits));
657    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
658 }
659 
660 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
661 {
662    tl_assert(isOriginalAtom(mce, data));
663    tl_assert(isShadowAtom(mce, vbits));
664    tl_assert(sameKindedAtoms(data, vbits));
665    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
666 }
667 
668 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
669 {
670    tl_assert(isOriginalAtom(mce, data));
671    tl_assert(isShadowAtom(mce, vbits));
672    tl_assert(sameKindedAtoms(data, vbits));
673    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
674 }
675 
676 /* ImproveOR(data, vbits) = ~data OR vbits.  Defined (0) data 1s give
677    defined (0); all other -> undefined (1).
678 */
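/* Worked single-bit example (made-up values), where d is a data bit
   and vd its V bit (0 = defined):
      d=1, vd=0  ->  ~d|vd = 0 : defined 1 forces a defined OR result
      d=0, vd=0  ->  ~d|vd = 1 : defined 0 gives no improvement here
      d=?, vd=1  ->  ~d|vd = 1 : undefined data gives no improvement
   As with AND, the improvement terms are DifD'd onto the naive UifU
   result when Or is instrumented. */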
679 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
680 {
681    tl_assert(isOriginalAtom(mce, data));
682    tl_assert(isShadowAtom(mce, vbits));
683    tl_assert(sameKindedAtoms(data, vbits));
684    return assignNew(
685              'V', mce, Ity_I8,
686              binop(Iop_Or8,
687                    assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
688                    vbits) );
689 }
690 
691 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
692 {
693    tl_assert(isOriginalAtom(mce, data));
694    tl_assert(isShadowAtom(mce, vbits));
695    tl_assert(sameKindedAtoms(data, vbits));
696    return assignNew(
697              'V', mce, Ity_I16,
698              binop(Iop_Or16,
699                    assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
700                    vbits) );
701 }
702 
703 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
704 {
705    tl_assert(isOriginalAtom(mce, data));
706    tl_assert(isShadowAtom(mce, vbits));
707    tl_assert(sameKindedAtoms(data, vbits));
708    return assignNew(
709              'V', mce, Ity_I32,
710              binop(Iop_Or32,
711                    assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
712                    vbits) );
713 }
714 
715 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
716 {
717    tl_assert(isOriginalAtom(mce, data));
718    tl_assert(isShadowAtom(mce, vbits));
719    tl_assert(sameKindedAtoms(data, vbits));
720    return assignNew(
721              'V', mce, Ity_I64,
722              binop(Iop_Or64,
723                    assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
724                    vbits) );
725 }
726 
727 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
728 {
729    tl_assert(isOriginalAtom(mce, data));
730    tl_assert(isShadowAtom(mce, vbits));
731    tl_assert(sameKindedAtoms(data, vbits));
732    return assignNew(
733              'V', mce, Ity_V128,
734              binop(Iop_OrV128,
735                    assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
736                    vbits) );
737 }
738 
739 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
740 {
741    tl_assert(isOriginalAtom(mce, data));
742    tl_assert(isShadowAtom(mce, vbits));
743    tl_assert(sameKindedAtoms(data, vbits));
744    return assignNew(
745              'V', mce, Ity_V256,
746              binop(Iop_OrV256,
747                    assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
748                    vbits) );
749 }
750 
751 /* --------- Pessimising casts. --------- */
752 
753 /* The function returns an expression of type DST_TY.  If any bit of
754    VBITS is undefined (value == 1), the resulting expression has all
755    bits set to 1.  Otherwise, all bits are 0. */
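/* Example (hypothetical values): PCast-ing the I32 vbits 0x00000400
   (one undefined bit) to Ity_I64 yields 0xFFFFFFFFFFFFFFFF (all bits
   undefined), whereas PCast-ing 0x00000000 yields 0x0 (all defined). */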
756 
757 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
758 {
759    IRType  src_ty;
760    IRAtom* tmp1;
761 
762    /* Note, dst_ty is a shadow type, not an original type. */
763    tl_assert(isShadowAtom(mce,vbits));
764    src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
765 
766    /* Fast-track some common cases */
767    if (src_ty == Ity_I32 && dst_ty == Ity_I32)
768       return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
769 
770    if (src_ty == Ity_I64 && dst_ty == Ity_I64)
771       return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
772 
773    if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
774       /* PCast the arg, then clone it. */
775       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
776       return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
777    }
778 
779    if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
780       /* PCast the arg, then clone it 4 times. */
781       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
782       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
783       return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
784    }
785 
786    if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
787       /* PCast the arg, then clone it 8 times. */
788       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
789       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
790       tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
791       return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
792    }
793 
794    if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
795       /* PCast the arg.  This gives all 0s or all 1s.  Then throw away
796          the top half. */
797       IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
798       return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
799    }
800 
801    if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
802       /* Use InterleaveHI64x2 to copy the top half of the vector into
803          the bottom half.  Then we can UifU it with the original, throw
804          away the upper half of the result, and PCast-I64-to-I64
805          the lower half. */
806       // Generates vbits[127:64] : vbits[127:64]
807       IRAtom* hi64hi64
808          = assignNew('V', mce, Ity_V128,
809                      binop(Iop_InterleaveHI64x2, vbits, vbits));
810       // Generates
811       //   UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
812       //   == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
813       IRAtom* lohi64
814          = mkUifUV128(mce, hi64hi64, vbits);
815       // Generates UifU(vbits[127:64],vbits[63:0])
816       IRAtom* lo64
817          = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
818       // Generates
819       //   PCast-to-I64( UifU(vbits[127:64], vbits[63:0]) )
820       //   == PCast-to-I64( vbits[127:0] )
821       IRAtom* res
822          = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
823       return res;
824    }
825 
826    /* Else do it the slow way .. */
827    /* First of all, collapse vbits down to a single bit. */
828    tmp1   = NULL;
829    switch (src_ty) {
830       case Ity_I1:
831          tmp1 = vbits;
832          break;
833       case Ity_I8:
834          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
835          break;
836       case Ity_I16:
837          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
838          break;
839       case Ity_I32:
840          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
841          break;
842       case Ity_I64:
843          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
844          break;
845       case Ity_I128: {
846          /* Gah.  Chop it in half, OR the halves together, and compare
847             that with zero. */
848          IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
849          IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
850          IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
851          tmp1         = assignNew('V', mce, Ity_I1,
852                                        unop(Iop_CmpNEZ64, tmp4));
853          break;
854       }
855       case Ity_V128: {
856          /* Chop it in half, OR the halves together, and compare that
857           * with zero.
858           */
859          IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
860          IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
861          IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
862          tmp1         = assignNew('V', mce, Ity_I1,
863                                        unop(Iop_CmpNEZ64, tmp4));
864          break;
865       }
866       default:
867          ppIRType(src_ty);
868          VG_(tool_panic)("mkPCastTo(1)");
869    }
870    tl_assert(tmp1);
871    /* Now widen up to the dst type. */
872    switch (dst_ty) {
873       case Ity_I1:
874          return tmp1;
875       case Ity_I8:
876          return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
877       case Ity_I16:
878          return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
879       case Ity_I32:
880          return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
881       case Ity_I64:
882          return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
883       case Ity_V128:
884          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
885          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
886          return tmp1;
887       case Ity_I128:
888          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
889          tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
890          return tmp1;
891       case Ity_V256:
892          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
893          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
894                                                     tmp1, tmp1));
895          tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
896                                                     tmp1, tmp1));
897          return tmp1;
898       default:
899          ppIRType(dst_ty);
900          VG_(tool_panic)("mkPCastTo(2)");
901    }
902 }
903 
904 /* This is a minor variant.  It takes an arg of some type and returns
905    a value of the same type.  The result consists entirely of Defined
906    (zero) bits except its least significant bit, which is a PCast of
907    the entire argument down to a single bit. */
908 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
909 {
910    if (ty == Ity_V128) {
911       /* --- Case for V128 --- */
912       IRAtom* varg128 = varg;
913       // generates: PCast-to-I64(varg128)
914       IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
915       // Now introduce zeros (defined bits) in the top 63 places
916       // generates: Def--(63)--Def PCast-to-I1(varg128)
917       IRAtom* d63pc
918          = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
919       // generates: Def--(64)--Def
920       IRAtom* d64
921          = definedOfType(Ity_I64);
922       // generates: Def--(127)--Def PCast-to-I1(varg128)
923       IRAtom* res
924          = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
925       return res;
926    }
927    if (ty == Ity_I64) {
928       /* --- Case for I64 --- */
929       // PCast to 64
930       IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
931       // Zero (Def) out the top 63 bits
932       IRAtom* res
933          = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
934       return res;
935    }
936    /*NOTREACHED*/
937    tl_assert(0);
938 }
939 
940 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
941 /*
942    Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
943    PCasting to Ity_U1.  However, sometimes it is necessary to be more
944    accurate.  The insight is that the result is defined if two
945    corresponding bits can be found, one from each argument, so that
946    both bits are defined but are different -- that makes EQ say "No"
947    and NE say "Yes".  Hence, we compute an improvement term and DifD
948    it onto the "normal" (UifU) result.
949 
950    The result is:
951 
952    PCastTo<1> (
953       -- naive version
954       PCastTo<sz>( UifU<sz>(vxx, vyy) )
955 
956       `DifD<sz>`
957 
958       -- improvement term
959       PCastTo<sz>( PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) ) )
960    )
961 
962    where
963      vec contains 0 (defined) bits where the corresponding arg bits
964      are defined but different, and 1 bits otherwise.
965 
966      vec = Or<sz>( vxx,   // 0 iff bit defined
967                    vyy,   // 0 iff bit defined
968                    Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
969                  )
970 
971      If any bit of vec is 0, the result is defined and so the
972      improvement term should produce 0...0, else it should produce
973      1...1.
974 
975      Hence require for the improvement term:
976 
977         if vec == 1...1 then 1...1 else 0...0
978      ->
979         PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) )
980 
981    This was extensively re-analysed and checked on 6 July 05.
982 */
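/* Worked example (made-up 4-bit values): xx = 1010 with vxx = 0100
   (bit 2 undefined), yy = 0010 with vyy = 0000 (fully defined).  Bit 3
   is defined in both args and differs, so vec has a 0 in bit 3, hence
   vec != 1111, the improvement term is 0...0, and the DifD forces the
   overall result to "defined" -- correctly so, since EQ/NE is decided
   by bit 3 alone, regardless of the undefined bit 2. */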
983 static IRAtom* expensiveCmpEQorNE ( MCEnv*  mce,
984                                     IRType  ty,
985                                     IRAtom* vxx, IRAtom* vyy,
986                                     IRAtom* xx,  IRAtom* yy )
987 {
988    IRAtom *naive, *vec, *improvement_term;
989    IRAtom *improved, *final_cast, *top;
990    IROp   opDIFD, opUIFU, opXOR, opNOT, opCMP, opOR;
991 
992    tl_assert(isShadowAtom(mce,vxx));
993    tl_assert(isShadowAtom(mce,vyy));
994    tl_assert(isOriginalAtom(mce,xx));
995    tl_assert(isOriginalAtom(mce,yy));
996    tl_assert(sameKindedAtoms(vxx,xx));
997    tl_assert(sameKindedAtoms(vyy,yy));
998 
999    switch (ty) {
1000       case Ity_I16:
1001          opOR   = Iop_Or16;
1002          opDIFD = Iop_And16;
1003          opUIFU = Iop_Or16;
1004          opNOT  = Iop_Not16;
1005          opXOR  = Iop_Xor16;
1006          opCMP  = Iop_CmpEQ16;
1007          top    = mkU16(0xFFFF);
1008          break;
1009       case Ity_I32:
1010          opOR   = Iop_Or32;
1011          opDIFD = Iop_And32;
1012          opUIFU = Iop_Or32;
1013          opNOT  = Iop_Not32;
1014          opXOR  = Iop_Xor32;
1015          opCMP  = Iop_CmpEQ32;
1016          top    = mkU32(0xFFFFFFFF);
1017          break;
1018       case Ity_I64:
1019          opOR   = Iop_Or64;
1020          opDIFD = Iop_And64;
1021          opUIFU = Iop_Or64;
1022          opNOT  = Iop_Not64;
1023          opXOR  = Iop_Xor64;
1024          opCMP  = Iop_CmpEQ64;
1025          top    = mkU64(0xFFFFFFFFFFFFFFFFULL);
1026          break;
1027       default:
1028          VG_(tool_panic)("expensiveCmpEQorNE");
1029    }
1030 
1031    naive
1032       = mkPCastTo(mce,ty,
1033                   assignNew('V', mce, ty, binop(opUIFU, vxx, vyy)));
1034 
1035    vec
1036       = assignNew(
1037            'V', mce,ty,
1038            binop( opOR,
1039                   assignNew('V', mce,ty, binop(opOR, vxx, vyy)),
1040                   assignNew(
1041                      'V', mce,ty,
1042                      unop( opNOT,
1043                            assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
1044 
1045    improvement_term
1046       = mkPCastTo( mce,ty,
1047                    assignNew('V', mce,Ity_I1, binop(opCMP, vec, top)));
1048 
1049    improved
1050       = assignNew( 'V', mce,ty, binop(opDIFD, naive, improvement_term) );
1051 
1052    final_cast
1053       = mkPCastTo( mce, Ity_I1, improved );
1054 
1055    return final_cast;
1056 }
1057 
1058 
1059 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1060 
1061 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1062 
1063       CmpORD32S(x,y) = 1<<3   if  x <s y
1064                      = 1<<2   if  x >s y
1065                      = 1<<1   if  x == y
1066 
1067    and similarly the unsigned variant.  The default interpretation is:
1068 
1069       CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1070                                   & (7<<1)
1071 
1072    The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1073    are zero and therefore defined (viz, zero).
1074 
1075    Also deal with a special case better:
1076 
1077       CmpORD32S(x,0)
1078 
1079    Here, bit 3 (LT) of the result is a copy of the top bit of x and
1080    will be defined even if the rest of x isn't.  In which case we do:
1081 
1082       CmpORD32S#(x,x#,0,{impliedly 0}#)
1083          = PCast(x#) & (3<<1)      -- standard interp for GT#,EQ#
1084            | (x# >>u 31) << 3      -- LT# = x#[31]
1085 
1086    Analogous handling for CmpORD64{S,U}.
1087 */
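/* Example (hypothetical): for CmpORD32S(x, 0) where only the sign bit
   of x is defined, the standard interpretation would mark all of the
   result bits 3..1 as undefined.  The special case instead keeps bit 3
   (LT) defined, since it is simply a copy of x's sign bit, and
   pessimises only the GT and EQ bits. */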
1088 static Bool isZeroU32 ( IRAtom* e )
1089 {
1090    return
1091       toBool( e->tag == Iex_Const
1092               && e->Iex.Const.con->tag == Ico_U32
1093               && e->Iex.Const.con->Ico.U32 == 0 );
1094 }
1095 
1096 static Bool isZeroU64 ( IRAtom* e )
1097 {
1098    return
1099       toBool( e->tag == Iex_Const
1100               && e->Iex.Const.con->tag == Ico_U64
1101               && e->Iex.Const.con->Ico.U64 == 0 );
1102 }
1103 
1104 static IRAtom* doCmpORD ( MCEnv*  mce,
1105                           IROp    cmp_op,
1106                           IRAtom* xxhash, IRAtom* yyhash,
1107                           IRAtom* xx,     IRAtom* yy )
1108 {
1109    Bool   m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1110    Bool   syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1111    IROp   opOR   = m64 ? Iop_Or64  : Iop_Or32;
1112    IROp   opAND  = m64 ? Iop_And64 : Iop_And32;
1113    IROp   opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
1114    IROp   opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
1115    IRType ty     = m64 ? Ity_I64   : Ity_I32;
1116    Int    width  = m64 ? 64        : 32;
1117 
1118    Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1119 
1120    IRAtom* threeLeft1 = NULL;
1121    IRAtom* sevenLeft1 = NULL;
1122 
1123    tl_assert(isShadowAtom(mce,xxhash));
1124    tl_assert(isShadowAtom(mce,yyhash));
1125    tl_assert(isOriginalAtom(mce,xx));
1126    tl_assert(isOriginalAtom(mce,yy));
1127    tl_assert(sameKindedAtoms(xxhash,xx));
1128    tl_assert(sameKindedAtoms(yyhash,yy));
1129    tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1130              || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1131 
1132    if (0) {
1133       ppIROp(cmp_op); VG_(printf)(" ");
1134       ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1135    }
1136 
1137    if (syned && isZero(yy)) {
1138       /* fancy interpretation */
1139       /* if yy is zero, then it must be fully defined (zero#). */
1140       tl_assert(isZero(yyhash));
1141       threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
1142       return
1143          binop(
1144             opOR,
1145             assignNew(
1146                'V', mce,ty,
1147                binop(
1148                   opAND,
1149                   mkPCastTo(mce,ty, xxhash),
1150                   threeLeft1
1151                )),
1152             assignNew(
1153                'V', mce,ty,
1154                binop(
1155                   opSHL,
1156                   assignNew(
1157                      'V', mce,ty,
1158                      binop(opSHR, xxhash, mkU8(width-1))),
1159                   mkU8(3)
1160                ))
1161 	 );
1162    } else {
1163       /* standard interpretation */
1164       sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1165       return
1166          binop(
1167             opAND,
1168             mkPCastTo( mce,ty,
1169                        mkUifU(mce,ty, xxhash,yyhash)),
1170             sevenLeft1
1171          );
1172    }
1173 }
1174 
1175 
1176 /*------------------------------------------------------------*/
1177 /*--- Emit a test and complaint if something is undefined. ---*/
1178 /*------------------------------------------------------------*/
1179 
1180 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1181 
1182 
1183 /* Set the annotations on a dirty helper to indicate that the stack
1184    pointer and instruction pointers might be read.  This is the
1185    behaviour of all 'emit-a-complaint' style functions we might
1186    call. */
1187 
1188 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1189    di->nFxState = 2;
1190    di->fxState[0].fx        = Ifx_Read;
1191    di->fxState[0].offset    = mce->layout->offset_SP;
1192    di->fxState[0].size      = mce->layout->sizeof_SP;
1193    di->fxState[0].nRepeats  = 0;
1194    di->fxState[0].repeatLen = 0;
1195    di->fxState[1].fx        = Ifx_Read;
1196    di->fxState[1].offset    = mce->layout->offset_IP;
1197    di->fxState[1].size      = mce->layout->sizeof_IP;
1198    di->fxState[1].nRepeats  = 0;
1199    di->fxState[1].repeatLen = 0;
1200 }
1201 
1202 
1203 /* Check the supplied *original* |atom| for undefinedness, and emit a
1204    complaint if so.  Once that happens, mark it as defined.  This is
1205    possible because the atom is either a tmp or literal.  If it's a
1206    tmp, it will be shadowed by a tmp, and so we can set the shadow to
1207    be defined.  In fact as mentioned above, we will have to allocate a
1208    new tmp to carry the new 'defined' shadow value, and update the
1209    original->tmp mapping accordingly; we cannot simply assign a new
1210    value to an existing shadow tmp as this breaks SSAness.
1211 
1212    The checks are performed, any resulting complaint emitted, and
1213    |atom|'s shadow temp set to 'defined', ONLY in the case that
1214    |guard| evaluates to True at run-time.  If it evaluates to False
1215    then no action is performed.  If |guard| is NULL (the usual case)
1216    then it is assumed to be always-true, and hence these actions are
1217    performed unconditionally.
1218 
1219    This routine does not generate code to check the definedness of
1220    |guard|.  The caller is assumed to have taken care of that already.
1221 */
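/* Roughly, the emitted IR has this shape (temp names invented):
      t_cond = PCast-to-I1(atom#)   -- 1 iff any bit of atom undefined
      if (t_cond [&& guard]) call MC_(helperc_value_check*_fail_*)(..)
   followed by re-binding atom's shadow tmp to an all-defined value
   (or, when guarded, to ITE(guard, all-defined, old shadow)). */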
1222 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1223 {
1224    IRAtom*  vatom;
1225    IRType   ty;
1226    Int      sz;
1227    IRDirty* di;
1228    IRAtom*  cond;
1229    IRAtom*  origin;
1230    void*    fn;
1231    const HChar* nm;
1232    IRExpr** args;
1233    Int      nargs;
1234 
1235    // Don't do V bit tests if we're not reporting undefined value errors.
1236    if (MC_(clo_mc_level) == 1)
1237       return;
1238 
1239    if (guard)
1240       tl_assert(isOriginalAtom(mce, guard));
1241 
1242    /* Since the original expression is atomic, there's no duplicated
1243       work generated by making multiple V-expressions for it.  So we
1244       don't really care about the possibility that someone else may
1245       also create a V-interpretion for it. */
1246    tl_assert(isOriginalAtom(mce, atom));
1247    vatom = expr2vbits( mce, atom );
1248    tl_assert(isShadowAtom(mce, vatom));
1249    tl_assert(sameKindedAtoms(atom, vatom));
1250 
1251    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1252 
1253    /* sz is only used for constructing the error message */
1254    sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1255 
1256    cond = mkPCastTo( mce, Ity_I1, vatom );
1257    /* cond will be 0 if all defined, and 1 if any not defined. */
1258 
1259    /* Get the origin info for the value we are about to check.  At
1260       least, if we are doing origin tracking.  If not, use a dummy
1261       zero origin. */
1262    if (MC_(clo_mc_level) == 3) {
1263       origin = schemeE( mce, atom );
1264       if (mce->hWordTy == Ity_I64) {
1265          origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1266       }
1267    } else {
1268       origin = NULL;
1269    }
1270 
1271    fn    = NULL;
1272    nm    = NULL;
1273    args  = NULL;
1274    nargs = -1;
1275 
1276    switch (sz) {
1277       case 0:
1278          if (origin) {
1279             fn    = &MC_(helperc_value_check0_fail_w_o);
1280             nm    = "MC_(helperc_value_check0_fail_w_o)";
1281             args  = mkIRExprVec_1(origin);
1282             nargs = 1;
1283          } else {
1284             fn    = &MC_(helperc_value_check0_fail_no_o);
1285             nm    = "MC_(helperc_value_check0_fail_no_o)";
1286             args  = mkIRExprVec_0();
1287             nargs = 0;
1288          }
1289          break;
1290       case 1:
1291          if (origin) {
1292             fn    = &MC_(helperc_value_check1_fail_w_o);
1293             nm    = "MC_(helperc_value_check1_fail_w_o)";
1294             args  = mkIRExprVec_1(origin);
1295             nargs = 1;
1296          } else {
1297             fn    = &MC_(helperc_value_check1_fail_no_o);
1298             nm    = "MC_(helperc_value_check1_fail_no_o)";
1299             args  = mkIRExprVec_0();
1300             nargs = 0;
1301          }
1302          break;
1303       case 4:
1304          if (origin) {
1305             fn    = &MC_(helperc_value_check4_fail_w_o);
1306             nm    = "MC_(helperc_value_check4_fail_w_o)";
1307             args  = mkIRExprVec_1(origin);
1308             nargs = 1;
1309          } else {
1310             fn    = &MC_(helperc_value_check4_fail_no_o);
1311             nm    = "MC_(helperc_value_check4_fail_no_o)";
1312             args  = mkIRExprVec_0();
1313             nargs = 0;
1314          }
1315          break;
1316       case 8:
1317          if (origin) {
1318             fn    = &MC_(helperc_value_check8_fail_w_o);
1319             nm    = "MC_(helperc_value_check8_fail_w_o)";
1320             args  = mkIRExprVec_1(origin);
1321             nargs = 1;
1322          } else {
1323             fn    = &MC_(helperc_value_check8_fail_no_o);
1324             nm    = "MC_(helperc_value_check8_fail_no_o)";
1325             args  = mkIRExprVec_0();
1326             nargs = 0;
1327          }
1328          break;
1329       case 2:
1330       case 16:
1331          if (origin) {
1332             fn    = &MC_(helperc_value_checkN_fail_w_o);
1333             nm    = "MC_(helperc_value_checkN_fail_w_o)";
1334             args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1335             nargs = 2;
1336          } else {
1337             fn    = &MC_(helperc_value_checkN_fail_no_o);
1338             nm    = "MC_(helperc_value_checkN_fail_no_o)";
1339             args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1340             nargs = 1;
1341          }
1342          break;
1343       default:
1344          VG_(tool_panic)("unexpected szB");
1345    }
1346 
1347    tl_assert(fn);
1348    tl_assert(nm);
1349    tl_assert(args);
1350    tl_assert(nargs >= 0 && nargs <= 2);
1351    tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1352               || (MC_(clo_mc_level) == 2 && origin == NULL) );
1353 
1354    di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1355                            VG_(fnptr_to_fnentry)( fn ), args );
1356    di->guard = cond; // and cond is PCast-to-1(atom#)
1357 
1358    /* If the complaint is to be issued under a guard condition, AND
1359       that into the guard condition for the helper call. */
1360    if (guard) {
1361       IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1362       IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1363       IRAtom *e  = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1364       di->guard  = assignNew('V', mce, Ity_I1,  unop(Iop_32to1, e));
1365    }
1366 
1367    setHelperAnns( mce, di );
1368    stmt( 'V', mce, IRStmt_Dirty(di));
1369 
1370    /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1371       defined -- but only in the case where the guard evaluates to
1372       True at run-time.  Do the update by setting the orig->shadow
1373       mapping for tmp to reflect the fact that this shadow is getting
1374       a new value. */
1375    tl_assert(isIRAtom(vatom));
1376    /* sameKindedAtoms ... */
1377    if (vatom->tag == Iex_RdTmp) {
1378       tl_assert(atom->tag == Iex_RdTmp);
1379       if (guard == NULL) {
1380          // guard is 'always True', hence update unconditionally
1381          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1382          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1383                           definedOfType(ty));
1384       } else {
1385          // update the temp only conditionally.  Do this by copying
1386          // its old value when the guard is False.
1387          // The old value ..
1388          IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1389          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1390          IRAtom* new_tmpV
1391             = assignNew('V', mce, shadowTypeV(ty),
1392                         IRExpr_ITE(guard, definedOfType(ty),
1393                                           mkexpr(old_tmpV)));
1394          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1395       }
1396    }
1397 }
1398 
1399 
1400 /*------------------------------------------------------------*/
1401 /*--- Shadowing PUTs/GETs, and indexed variants thereof    ---*/
1402 /*------------------------------------------------------------*/
1403 
1404 /* Examine the always-defined sections declared in layout to see if
1405    the (offset,size) section is within one.  Note, it is an error to
1406    partially fall into such a region: (offset,size) should either be
1407    completely in such a region or completely not-in such a region.
1408 */
1409 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1410 {
1411    Int minoffD, maxoffD, i;
1412    Int minoff = offset;
1413    Int maxoff = minoff + size - 1;
1414    tl_assert((minoff & ~0xFFFF) == 0);
1415    tl_assert((maxoff & ~0xFFFF) == 0);
1416 
1417    for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1418       minoffD = mce->layout->alwaysDefd[i].offset;
1419       maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1420       tl_assert((minoffD & ~0xFFFF) == 0);
1421       tl_assert((maxoffD & ~0xFFFF) == 0);
1422 
1423       if (maxoff < minoffD || maxoffD < minoff)
1424          continue; /* no overlap */
1425       if (minoff >= minoffD && maxoff <= maxoffD)
1426          return True; /* completely contained in an always-defd section */
1427 
1428       VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1429    }
1430    return False; /* could not find any containing section */
1431 }
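/* Worked example (the offsets below are purely illustrative, not any
   real guest layout).  Suppose layout->alwaysDefd[] contains exactly
   one entry, { offset = 64, size = 8 }.  Then:

      isAlwaysDefd(mce, 64, 8)  -> True   (exactly the region)
      isAlwaysDefd(mce, 68, 4)  -> True   (fully inside it)
      isAlwaysDefd(mce,  0, 4)  -> False  (no overlap at all)
      isAlwaysDefd(mce, 60, 8)  -> panics (straddles the boundary,
                                           which is disallowed)
*/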
1432 
1433 
1434 /* Generate into bb suitable actions to shadow this Put.  If the state
1435    slice is marked 'always defined', do nothing.  Otherwise, write the
1436    supplied V bits to the shadow state.  We can pass in either an
1437    original atom or a V-atom, but not both.  In the former case the
1438    relevant V-bits are then generated from the original.
1439    We assume here that the definedness of GUARD has already been checked.
1440 */
1441 static
1442 void do_shadow_PUT ( MCEnv* mce,  Int offset,
1443                      IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1444 {
1445    IRType ty;
1446 
1447    // Don't do shadow PUTs if we're not doing undefined value checking.
1448    // Their absence lets Vex's optimiser remove all the shadow computation
1449    // that they depend on, which includes GETs of the shadow registers.
1450    if (MC_(clo_mc_level) == 1)
1451       return;
1452 
1453    if (atom) {
1454       tl_assert(!vatom);
1455       tl_assert(isOriginalAtom(mce, atom));
1456       vatom = expr2vbits( mce, atom );
1457    } else {
1458       tl_assert(vatom);
1459       tl_assert(isShadowAtom(mce, vatom));
1460    }
1461 
1462    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1463    tl_assert(ty != Ity_I1);
1464    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1465       /* later: no ... */
1466       /* emit code to emit a complaint if any of the vbits are 1. */
1467       /* complainIfUndefined(mce, atom); */
1468    } else {
1469       /* Do a plain shadow Put. */
1470       if (guard) {
1471          /* If the guard expression evaluates to false we simply Put the value
1472             that is already stored in the guest state slot */
1473          IRAtom *cond, *iffalse;
1474 
1475          cond    = assignNew('V', mce, Ity_I1, guard);
1476          iffalse = assignNew('V', mce, ty,
1477                              IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1478          vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1479       }
1480       stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
1481    }
1482 }
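/* Sketch of the guarded case above, in IR-ish pseudocode (names are
   illustrative only).  For a guarded PUT at guest offset OFF of value
   ATOM with shadow value VATOM, the instrumentation behaves like:

      t_old = GET:ty(OFF + total_sizeB)    -- shadow state already there
      t_new = ITE(guard, VATOM, t_old)     -- keep the old V bits when
                                           -- the guard is false
      PUT(OFF + total_sizeB) = t_new

   so the shadow state only really changes when the original PUT
   actually happens. */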
1483 
1484 
1485 /* Generate into the output block suitable actions to shadow this PutI
1486    (passed in in pieces).
1487 */
1488 static
1489 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1490 {
1491    IRAtom* vatom;
1492    IRType  ty, tyS;
1493    Int     arrSize;
1494    IRRegArray* descr = puti->descr;
1495    IRAtom*     ix    = puti->ix;
1496    Int         bias  = puti->bias;
1497    IRAtom*     atom  = puti->data;
1498 
1499    // Don't do shadow PUTIs if we're not doing undefined value checking.
1500    // Their absence lets Vex's optimiser remove all the shadow computation
1501    // that they depend on, which includes GETIs of the shadow registers.
1502    if (MC_(clo_mc_level) == 1)
1503       return;
1504 
1505    tl_assert(isOriginalAtom(mce,atom));
1506    vatom = expr2vbits( mce, atom );
1507    tl_assert(sameKindedAtoms(atom, vatom));
1508    ty   = descr->elemTy;
1509    tyS  = shadowTypeV(ty);
1510    arrSize = descr->nElems * sizeofIRType(ty);
1511    tl_assert(ty != Ity_I1);
1512    tl_assert(isOriginalAtom(mce,ix));
1513    complainIfUndefined(mce, ix, NULL);
1514    if (isAlwaysDefd(mce, descr->base, arrSize)) {
1515       /* later: no ... */
1516       /* emit code to emit a complaint if any of the vbits are 1. */
1517       /* complainIfUndefined(mce, atom); */
1518    } else {
1519       /* Do a cloned version of the Put that refers to the shadow
1520          area. */
1521       IRRegArray* new_descr
1522          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1523                          tyS, descr->nElems);
1524       stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1525    }
1526 }
1527 
1528 
1529 /* Return an expression which contains the V bits corresponding to the
1530    given GET (passed in in pieces).
1531 */
1532 static
1533 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1534 {
1535    IRType tyS = shadowTypeV(ty);
1536    tl_assert(ty != Ity_I1);
1537    tl_assert(ty != Ity_I128);
1538    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1539       /* Always defined, return all zeroes of the relevant type */
1540       return definedOfType(tyS);
1541    } else {
1542       /* return a cloned version of the Get that refers to the shadow
1543          area. */
1544       /* FIXME: this isn't an atom! */
1545       return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1546    }
1547 }
1548 
1549 
1550 /* Return an expression which contains the V bits corresponding to the
1551    given GETI (passed in in pieces).
1552 */
1553 static
1554 IRExpr* shadow_GETI ( MCEnv* mce,
1555                       IRRegArray* descr, IRAtom* ix, Int bias )
1556 {
1557    IRType ty   = descr->elemTy;
1558    IRType tyS  = shadowTypeV(ty);
1559    Int arrSize = descr->nElems * sizeofIRType(ty);
1560    tl_assert(ty != Ity_I1);
1561    tl_assert(isOriginalAtom(mce,ix));
1562    complainIfUndefined(mce, ix, NULL);
1563    if (isAlwaysDefd(mce, descr->base, arrSize)) {
1564       /* Always defined, return all zeroes of the relevant type */
1565       return definedOfType(tyS);
1566    } else {
1567       /* return a cloned version of the Get that refers to the shadow
1568          area. */
1569       IRRegArray* new_descr
1570          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1571                          tyS, descr->nElems);
1572       return IRExpr_GetI( new_descr, ix, bias );
1573    }
1574 }
1575 
1576 
1577 /*------------------------------------------------------------*/
1578 /*--- Generating approximations for unknown operations,    ---*/
1579 /*--- using lazy-propagate semantics                       ---*/
1580 /*------------------------------------------------------------*/
1581 
1582 /* Lazy propagation of undefinedness from two values, resulting in the
1583    specified shadow type.
1584 */
1585 static
1586 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1587 {
1588    IRAtom* at;
1589    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1590    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1591    tl_assert(isShadowAtom(mce,va1));
1592    tl_assert(isShadowAtom(mce,va2));
1593 
1594    /* The general case is inefficient because PCast is an expensive
1595       operation.  Here are some special cases which use PCast only
1596       once rather than twice. */
1597 
1598    /* I64 x I64 -> I64 */
1599    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1600       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1601       at = mkUifU(mce, Ity_I64, va1, va2);
1602       at = mkPCastTo(mce, Ity_I64, at);
1603       return at;
1604    }
1605 
1606    /* I64 x I64 -> I32 */
1607    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1608       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1609       at = mkUifU(mce, Ity_I64, va1, va2);
1610       at = mkPCastTo(mce, Ity_I32, at);
1611       return at;
1612    }
1613 
1614    /* I32 x I32 -> I32 */
1615    if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
1616       if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
1617       at = mkUifU(mce, Ity_I32, va1, va2);
1618       at = mkPCastTo(mce, Ity_I32, at);
1619       return at;
1620    }
1621 
1622    if (0) {
1623       VG_(printf)("mkLazy2 ");
1624       ppIRType(t1);
1625       VG_(printf)("_");
1626       ppIRType(t2);
1627       VG_(printf)("_");
1628       ppIRType(finalVty);
1629       VG_(printf)("\n");
1630    }
1631 
1632    /* General case: force everything via 32-bit intermediaries. */
1633    at = mkPCastTo(mce, Ity_I32, va1);
1634    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1635    at = mkPCastTo(mce, finalVty, at);
1636    return at;
1637 }
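/* Illustration of the I64 x I64 -> I64 special case above, with made-up
   V-bit values (1 = undefined).  Recall UifU is a bitwise Or of V bits
   and PCast smears any undefinedness across the whole result:

      va1 = 0x0000_0000_0000_FF00   (byte 1 of arg1 is undefined)
      va2 = 0x0000_0000_0000_0000   (arg2 fully defined)

      UifU64(va1, va2)  = 0x0000_0000_0000_FF00
      PCast to I64      = 0xFFFF_FFFF_FFFF_FFFF

   i.e. the whole result is flagged undefined, using a single PCast. */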
1638 
1639 
1640 /* 3-arg version of the above. */
1641 static
1642 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
1643                   IRAtom* va1, IRAtom* va2, IRAtom* va3 )
1644 {
1645    IRAtom* at;
1646    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1647    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1648    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1649    tl_assert(isShadowAtom(mce,va1));
1650    tl_assert(isShadowAtom(mce,va2));
1651    tl_assert(isShadowAtom(mce,va3));
1652 
1653    /* The general case is inefficient because PCast is an expensive
1654       operation.  Here are some special cases which use PCast only
1655       twice rather than three times. */
1656 
1657    /* I32 x I64 x I64 -> I64 */
1658    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1659    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1660        && finalVty == Ity_I64) {
1661       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
1662       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
1663          mode indication which is fully defined, this should get
1664          folded out later. */
1665       at = mkPCastTo(mce, Ity_I64, va1);
1666       /* Now fold in 2nd and 3rd args. */
1667       at = mkUifU(mce, Ity_I64, at, va2);
1668       at = mkUifU(mce, Ity_I64, at, va3);
1669       /* and PCast once again. */
1670       at = mkPCastTo(mce, Ity_I64, at);
1671       return at;
1672    }
1673 
1674    /* I32 x I8 x I64 -> I64 */
1675    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
1676        && finalVty == Ity_I64) {
1677       if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
1678       /* Widen 1st and 2nd args to I64.  Since 1st arg is typically a
1679        * rounding mode indication which is fully defined, this should
1680        * get folded out later.
1681       */
1682       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1683       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1684       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
1685       at = mkUifU(mce, Ity_I64, at, va3);
1686       /* and PCast once again. */
1687       at = mkPCastTo(mce, Ity_I64, at);
1688       return at;
1689    }
1690 
1691    /* I32 x I64 x I64 -> I32 */
1692    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1693        && finalVty == Ity_I32) {
1694       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
1695       at = mkPCastTo(mce, Ity_I64, va1);
1696       at = mkUifU(mce, Ity_I64, at, va2);
1697       at = mkUifU(mce, Ity_I64, at, va3);
1698       at = mkPCastTo(mce, Ity_I32, at);
1699       return at;
1700    }
1701 
1702    /* I32 x I32 x I32 -> I32 */
1703    /* 32-bit FP idiom, as (eg) happens on ARM */
1704    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
1705        && finalVty == Ity_I32) {
1706       if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
1707       at = va1;
1708       at = mkUifU(mce, Ity_I32, at, va2);
1709       at = mkUifU(mce, Ity_I32, at, va3);
1710       at = mkPCastTo(mce, Ity_I32, at);
1711       return at;
1712    }
1713 
1714    /* I32 x I128 x I128 -> I128 */
1715    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1716    if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
1717        && finalVty == Ity_I128) {
1718       if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
1719       /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
1720          mode indication which is fully defined, this should get
1721          folded out later. */
1722       at = mkPCastTo(mce, Ity_I128, va1);
1723       /* Now fold in 2nd and 3rd args. */
1724       at = mkUifU(mce, Ity_I128, at, va2);
1725       at = mkUifU(mce, Ity_I128, at, va3);
1726       /* and PCast once again. */
1727       at = mkPCastTo(mce, Ity_I128, at);
1728       return at;
1729    }
1730 
1731    /* I32 x I8 x I128 -> I128 */
1732    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1733    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
1734        && finalVty == Ity_I128) {
1735       if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
1736       /* Use I64 as an intermediate type, which means PCasting all 3
1737          args to I64 to start with. 1st arg is typically a rounding
1738          mode indication which is fully defined, so we hope that it
1739          will get folded out later. */
1740       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1741       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1742       IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
1743       /* Now UifU all three together. */
1744       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
1745       at = mkUifU(mce, Ity_I64, at, at3);   // ... `UifU` PCast(va3)
1746       /* and PCast once again. */
1747       at = mkPCastTo(mce, Ity_I128, at);
1748       return at;
1749    }
1750    if (1) {
1751       VG_(printf)("mkLazy3: ");
1752       ppIRType(t1);
1753       VG_(printf)(" x ");
1754       ppIRType(t2);
1755       VG_(printf)(" x ");
1756       ppIRType(t3);
1757       VG_(printf)(" -> ");
1758       ppIRType(finalVty);
1759       VG_(printf)("\n");
1760    }
1761 
1762    tl_assert(0);
1763    /* General case: force everything via 32-bit intermediaries. */
1764    /*
1765    at = mkPCastTo(mce, Ity_I32, va1);
1766    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1767    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
1768    at = mkPCastTo(mce, finalVty, at);
1769    return at;
1770    */
1771 }
1772 
1773 
1774 /* 4-arg version of the above. */
1775 static
1776 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
1777                   IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
1778 {
1779    IRAtom* at;
1780    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1781    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1782    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1783    IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
1784    tl_assert(isShadowAtom(mce,va1));
1785    tl_assert(isShadowAtom(mce,va2));
1786    tl_assert(isShadowAtom(mce,va3));
1787    tl_assert(isShadowAtom(mce,va4));
1788 
1789    /* The general case is inefficient because PCast is an expensive
1790       operation.  Here are some special cases which use PCast only
1791       twice rather than four times. */
1792 
1793    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
1794 
1795    if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
1796        && finalVty == Ity_I128) {
1797       if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
1798       /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
1799          mode indication which is fully defined, this should get
1800          folded out later. */
1801       at = mkPCastTo(mce, Ity_I128, va1);
1802       /* Now fold in 2nd, 3rd, 4th args. */
1803       at = mkUifU(mce, Ity_I128, at, va2);
1804       at = mkUifU(mce, Ity_I128, at, va3);
1805       at = mkUifU(mce, Ity_I128, at, va4);
1806       /* and PCast once again. */
1807       at = mkPCastTo(mce, Ity_I128, at);
1808       return at;
1809    }
1810 
1811    /* I32 x I64 x I64 x I64 -> I64 */
1812    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
1813        && finalVty == Ity_I64) {
1814       if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
1815       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
1816          mode indication which is fully defined, this should get
1817          folded out later. */
1818       at = mkPCastTo(mce, Ity_I64, va1);
1819       /* Now fold in 2nd, 3rd, 4th args. */
1820       at = mkUifU(mce, Ity_I64, at, va2);
1821       at = mkUifU(mce, Ity_I64, at, va3);
1822       at = mkUifU(mce, Ity_I64, at, va4);
1823       /* and PCast once again. */
1824       at = mkPCastTo(mce, Ity_I64, at);
1825       return at;
1826    }
1827    /* I32 x I32 x I32 x I32 -> I32 */
1828    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
1829    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
1830        && finalVty == Ity_I32) {
1831       if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
1832       at = va1;
1833       /* Now fold in 2nd, 3rd, 4th args. */
1834       at = mkUifU(mce, Ity_I32, at, va2);
1835       at = mkUifU(mce, Ity_I32, at, va3);
1836       at = mkUifU(mce, Ity_I32, at, va4);
1837       at = mkPCastTo(mce, Ity_I32, at);
1838       return at;
1839    }
1840 
1841    if (1) {
1842       VG_(printf)("mkLazy4: ");
1843       ppIRType(t1);
1844       VG_(printf)(" x ");
1845       ppIRType(t2);
1846       VG_(printf)(" x ");
1847       ppIRType(t3);
1848       VG_(printf)(" x ");
1849       ppIRType(t4);
1850       VG_(printf)(" -> ");
1851       ppIRType(finalVty);
1852       VG_(printf)("\n");
1853    }
1854 
1855    tl_assert(0);
1856 }
1857 
1858 
1859 /* Do the lazy propagation game from a null-terminated vector of
1860    atoms.  This is presumably the arguments to a helper call, so the
1861    IRCallee info is also supplied in order that we can know which
1862    arguments should be ignored (via the .mcx_mask field).
1863 */
1864 static
1865 IRAtom* mkLazyN ( MCEnv* mce,
1866                   IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
1867 {
1868    Int     i;
1869    IRAtom* here;
1870    IRAtom* curr;
1871    IRType  mergeTy;
1872    Bool    mergeTy64 = True;
1873 
1874    /* Decide on the type of the merge intermediary.  If all relevant
1875       args are I64, then it's I64.  In all other circumstances, use
1876       I32. */
1877    for (i = 0; exprvec[i]; i++) {
1878       tl_assert(i < 32);
1879       tl_assert(isOriginalAtom(mce, exprvec[i]));
1880       if (cee->mcx_mask & (1<<i))
1881          continue;
1882       if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
1883          mergeTy64 = False;
1884    }
1885 
1886    mergeTy = mergeTy64  ? Ity_I64  : Ity_I32;
1887    curr    = definedOfType(mergeTy);
1888 
1889    for (i = 0; exprvec[i]; i++) {
1890       tl_assert(i < 32);
1891       tl_assert(isOriginalAtom(mce, exprvec[i]));
1892       /* Only take notice of this arg if the callee's mc-exclusion
1893          mask does not say it is to be excluded. */
1894       if (cee->mcx_mask & (1<<i)) {
1895          /* the arg is to be excluded from definedness checking.  Do
1896             nothing. */
1897          if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
1898       } else {
1899          /* calculate the arg's definedness, and pessimistically merge
1900             it in. */
1901          here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i]) );
1902          curr = mergeTy64
1903                    ? mkUifU64(mce, here, curr)
1904                    : mkUifU32(mce, here, curr);
1905       }
1906    }
1907    return mkPCastTo(mce, finalVtype, curr );
1908 }
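/* Example of the mcx_mask treatment (the helper is hypothetical, purely
   for illustration).  For a ccall f(rm, x, y) whose IRCallee has
   mcx_mask = 1, arg 0 (rm) is excluded from checking, so the result's
   V bits are computed as

      PCastTo(finalVtype, UifU( PCast(x#), PCast(y#) ))

   and rm's definedness is ignored entirely. */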
1909 
1910 
1911 /*------------------------------------------------------------*/
1912 /*--- Generating expensive sequences for exact carry-chain ---*/
1913 /*--- propagation in add/sub and related operations.       ---*/
1914 /*------------------------------------------------------------*/
1915 
1916 static
1917 IRAtom* expensiveAddSub ( MCEnv*  mce,
1918                           Bool    add,
1919                           IRType  ty,
1920                           IRAtom* qaa, IRAtom* qbb,
1921                           IRAtom* aa,  IRAtom* bb )
1922 {
1923    IRAtom *a_min, *b_min, *a_max, *b_max;
1924    IROp   opAND, opOR, opXOR, opNOT, opADD, opSUB;
1925 
1926    tl_assert(isShadowAtom(mce,qaa));
1927    tl_assert(isShadowAtom(mce,qbb));
1928    tl_assert(isOriginalAtom(mce,aa));
1929    tl_assert(isOriginalAtom(mce,bb));
1930    tl_assert(sameKindedAtoms(qaa,aa));
1931    tl_assert(sameKindedAtoms(qbb,bb));
1932 
1933    switch (ty) {
1934       case Ity_I32:
1935          opAND = Iop_And32;
1936          opOR  = Iop_Or32;
1937          opXOR = Iop_Xor32;
1938          opNOT = Iop_Not32;
1939          opADD = Iop_Add32;
1940          opSUB = Iop_Sub32;
1941          break;
1942       case Ity_I64:
1943          opAND = Iop_And64;
1944          opOR  = Iop_Or64;
1945          opXOR = Iop_Xor64;
1946          opNOT = Iop_Not64;
1947          opADD = Iop_Add64;
1948          opSUB = Iop_Sub64;
1949          break;
1950       default:
1951          VG_(tool_panic)("expensiveAddSub");
1952    }
1953 
1954    // a_min = aa & ~qaa
1955    a_min = assignNew('V', mce,ty,
1956                      binop(opAND, aa,
1957                                   assignNew('V', mce,ty, unop(opNOT, qaa))));
1958 
1959    // b_min = bb & ~qbb
1960    b_min = assignNew('V', mce,ty,
1961                      binop(opAND, bb,
1962                                   assignNew('V', mce,ty, unop(opNOT, qbb))));
1963 
1964    // a_max = aa | qaa
1965    a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
1966 
1967    // b_max = bb | qbb
1968    b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
1969 
1970    if (add) {
1971       // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
1972       return
1973       assignNew('V', mce,ty,
1974          binop( opOR,
1975                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
1976                 assignNew('V', mce,ty,
1977                    binop( opXOR,
1978                           assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
1979                           assignNew('V', mce,ty, binop(opADD, a_max, b_max))
1980                    )
1981                 )
1982          )
1983       );
1984    } else {
1985       // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
1986       return
1987       assignNew('V', mce,ty,
1988          binop( opOR,
1989                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
1990                 assignNew('V', mce,ty,
1991                    binop( opXOR,
1992                           assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
1993                           assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
1994                    )
1995                 )
1996          )
1997       );
1998    }
1999 
2000 }
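/* Worked example for the 'add' case, shown with 8-bit values purely for
   brevity (the code itself only handles I32 and I64):

      aa  = 0b0000_0100   qaa = 0b0000_0011   (low 2 bits of aa undefined)
      bb  = 0b0000_0001   qbb = 0b0000_0000   (bb fully defined)

      a_min = 0b0000_0100   a_max = 0b0000_0111
      b_min = 0b0000_0001   b_max = 0b0000_0001

      (a_min + b_min) ^ (a_max + b_max) = 0b0101 ^ 0b1000 = 0b1101
      result = 0b0000_0011 | 0b0000_1101 = 0b0000_1111

   Bits 0..3 of the sum are reported undefined: the unknown low bits of
   aa can change the carry into bits 2 and 3, and comparing the two
   extreme sums is exactly what detects that. */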
2001 
2002 
2003 static
2004 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
2005                                        IRAtom* atom, IRAtom* vatom )
2006 {
2007    IRType ty;
2008    IROp xorOp, subOp, andOp;
2009    IRExpr *one;
2010    IRAtom *improver, *improved;
2011    tl_assert(isShadowAtom(mce,vatom));
2012    tl_assert(isOriginalAtom(mce,atom));
2013    tl_assert(sameKindedAtoms(atom,vatom));
2014 
2015    switch (czop) {
2016       case Iop_Ctz32:
2017          ty = Ity_I32;
2018          xorOp = Iop_Xor32;
2019          subOp = Iop_Sub32;
2020          andOp = Iop_And32;
2021          one = mkU32(1);
2022          break;
2023       case Iop_Ctz64:
2024          ty = Ity_I64;
2025          xorOp = Iop_Xor64;
2026          subOp = Iop_Sub64;
2027          andOp = Iop_And64;
2028          one = mkU64(1);
2029          break;
2030       default:
2031          ppIROp(czop);
2032          VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
2033    }
2034 
2035    // improver = atom ^ (atom - 1)
2036    //
2037    // That is, improver has its low ctz(atom)+1 bits equal to one;
2038    // higher bits (if any) equal to zero.
2039    improver = assignNew('V', mce,ty,
2040                         binop(xorOp,
2041                               atom,
2042                               assignNew('V', mce, ty,
2043                                         binop(subOp, atom, one))));
2044 
2045    // improved = vatom & improver
2046    //
2047    // That is, treat any V bits above the low ctz(atom)+1 bits as
2048    // "defined".
2049    improved = assignNew('V', mce, ty,
2050                         binop(andOp, vatom, improver));
2051 
2052    // Return pessimizing cast of improved.
2053    return mkPCastTo(mce, ty, improved);
2054 }
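/* Worked example, with an illustrative 8-bit value (the code itself
   only handles Ctz32/Ctz64):

      atom     = 0b0110_1000        ctz(atom) = 3
      atom - 1 = 0b0110_0111
      improver = 0b0000_1111        (the low ctz(atom)+1 bits)

   ANDing vatom with improver discards V bits above bit 3: once the low
   bits up to and including the lowest set bit are known to be defined,
   higher bits cannot change the count of trailing zeroes.  Any
   surviving V bit then makes the whole count undefined via the final
   PCast. */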
2055 
2056 
2057 /*------------------------------------------------------------*/
2058 /*--- Scalar shifts.                                       ---*/
2059 /*------------------------------------------------------------*/
2060 
2061 /* Produce an interpretation for (aa << bb) (or >>s, >>u).  The basic
2062    idea is to shift the definedness bits by the original shift amount.
2063    This introduces 0s ("defined") in new positions for left shifts and
2064    unsigned right shifts, and copies the top definedness bit for
2065    signed right shifts.  So, conveniently, applying the original shift
2066    operator to the definedness bits for the left arg is exactly the
2067    right thing to do:
2068 
2069       (qaa << bb)
2070 
2071    However if the shift amount is undefined then the whole result
2072    is undefined.  Hence need:
2073 
2074       (qaa << bb) `UifU` PCast(qbb)
2075 
2076    If the shift amount bb is a literal then qbb will say 'all defined'
2077    and the UifU and PCast will get folded out by post-instrumentation
2078    optimisation.
2079 */
2080 static IRAtom* scalarShift ( MCEnv*  mce,
2081                              IRType  ty,
2082                              IROp    original_op,
2083                              IRAtom* qaa, IRAtom* qbb,
2084                              IRAtom* aa,  IRAtom* bb )
2085 {
2086    tl_assert(isShadowAtom(mce,qaa));
2087    tl_assert(isShadowAtom(mce,qbb));
2088    tl_assert(isOriginalAtom(mce,aa));
2089    tl_assert(isOriginalAtom(mce,bb));
2090    tl_assert(sameKindedAtoms(qaa,aa));
2091    tl_assert(sameKindedAtoms(qbb,bb));
2092    return
2093       assignNew(
2094          'V', mce, ty,
2095          mkUifU( mce, ty,
2096                  assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2097                  mkPCastTo(mce, ty, qbb)
2098          )
2099    );
2100 }
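/* Example (illustrative values).  For a 32-bit aa << bb where bb is the
   constant 8 and qaa = 0x0000_00F0 (bits 4..7 of aa undefined):

      qaa << bb       = 0x0000_F000   (undefined bits move with the data)
      PCast(qbb)      = 0x0000_0000   (a literal's shadow is all defined)
      UifU of the two = 0x0000_F000

   so only the shifted-up copies of the original undefined bits are
   flagged, and the PCast(qbb) term folds away after optimisation. */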
2101 
2102 
2103 /*------------------------------------------------------------*/
2104 /*--- Helpers for dealing with vector primops.             ---*/
2105 /*------------------------------------------------------------*/
2106 
2107 /* Vector pessimisation -- pessimise within each lane individually. */
2108 
2109 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2110 {
2111    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2112 }
2113 
2114 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2115 {
2116    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2117 }
2118 
2119 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2120 {
2121    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2122 }
2123 
2124 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2125 {
2126    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2127 }
2128 
2129 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2130 {
2131    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2132 }
2133 
2134 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2135 {
2136    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2137 }
2138 
2139 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2140 {
2141    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2142 }
2143 
2144 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2145 {
2146    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2147 }
2148 
2149 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2150 {
2151    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2152 }
2153 
2154 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2155 {
2156    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2157 }
2158 
2159 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2160 {
2161    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2162 }
2163 
2164 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2165 {
2166    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2167 }
2168 
2169 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2170 {
2171    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2172 }
2173 
2174 
2175 /* Here's a simple scheme capable of handling ops derived from SSE1
2176    code while only generating ops that can be efficiently
2177    implemented in SSE1. */
2178 
2179 /* All-lanes versions are straightforward:
2180 
2181    binary32Fx4(x,y)   ==> PCast32x4(UifUV128(x#,y#))
2182 
2183    unary32Fx4(x,y)    ==> PCast32x4(x#)
2184 
2185    Lowest-lane-only versions are more complex:
2186 
2187    binary32F0x4(x,y)  ==> SetV128lo32(
2188                              x#,
2189                              PCast32(V128to32(UifUV128(x#,y#)))
2190                           )
2191 
2192    This is perhaps not so obvious.  In particular, it's faster to
2193    do a V128-bit UifU and then take the bottom 32 bits than the more
2194    obvious scheme of taking the bottom 32 bits of each operand
2195    and doing a 32-bit UifU.  Basically since UifU is fast and
2196    chopping lanes off vector values is slow.
2197 
2198    Finally:
2199 
2200    unary32F0x4(x)     ==> SetV128lo32(
2201                              x#,
2202                              PCast32(V128to32(x#))
2203                           )
2204 
2205    Where:
2206 
2207    PCast32(v#)   = 1Sto32(CmpNE32(v#,0))
2208    PCast32x4(v#) = CmpNEZ32x4(v#)
2209 */
2210 
2211 static
2212 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2213 {
2214    IRAtom* at;
2215    tl_assert(isShadowAtom(mce, vatomX));
2216    tl_assert(isShadowAtom(mce, vatomY));
2217    at = mkUifUV128(mce, vatomX, vatomY);
2218    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2219    return at;
2220 }
2221 
2222 static
2223 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2224 {
2225    IRAtom* at;
2226    tl_assert(isShadowAtom(mce, vatomX));
2227    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2228    return at;
2229 }
2230 
2231 static
2232 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2233 {
2234    IRAtom* at;
2235    tl_assert(isShadowAtom(mce, vatomX));
2236    tl_assert(isShadowAtom(mce, vatomY));
2237    at = mkUifUV128(mce, vatomX, vatomY);
2238    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2239    at = mkPCastTo(mce, Ity_I32, at);
2240    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2241    return at;
2242 }
2243 
2244 static
2245 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2246 {
2247    IRAtom* at;
2248    tl_assert(isShadowAtom(mce, vatomX));
2249    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2250    at = mkPCastTo(mce, Ity_I32, at);
2251    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2252    return at;
2253 }
2254 
2255 /* --- ... and ... 64Fx2 versions of the same ... --- */
2256 
2257 static
2258 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2259 {
2260    IRAtom* at;
2261    tl_assert(isShadowAtom(mce, vatomX));
2262    tl_assert(isShadowAtom(mce, vatomY));
2263    at = mkUifUV128(mce, vatomX, vatomY);
2264    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2265    return at;
2266 }
2267 
2268 static
2269 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2270 {
2271    IRAtom* at;
2272    tl_assert(isShadowAtom(mce, vatomX));
2273    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2274    return at;
2275 }
2276 
2277 static
2278 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2279 {
2280    IRAtom* at;
2281    tl_assert(isShadowAtom(mce, vatomX));
2282    tl_assert(isShadowAtom(mce, vatomY));
2283    at = mkUifUV128(mce, vatomX, vatomY);
2284    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2285    at = mkPCastTo(mce, Ity_I64, at);
2286    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2287    return at;
2288 }
2289 
2290 static
2291 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2292 {
2293    IRAtom* at;
2294    tl_assert(isShadowAtom(mce, vatomX));
2295    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2296    at = mkPCastTo(mce, Ity_I64, at);
2297    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2298    return at;
2299 }
2300 
2301 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2302 
2303 static
2304 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2305 {
2306    IRAtom* at;
2307    tl_assert(isShadowAtom(mce, vatomX));
2308    tl_assert(isShadowAtom(mce, vatomY));
2309    at = mkUifU64(mce, vatomX, vatomY);
2310    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2311    return at;
2312 }
2313 
2314 static
2315 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2316 {
2317    IRAtom* at;
2318    tl_assert(isShadowAtom(mce, vatomX));
2319    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2320    return at;
2321 }
2322 
2323 /* --- ... and ... 64Fx4 versions of the same ... --- */
2324 
2325 static
2326 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2327 {
2328    IRAtom* at;
2329    tl_assert(isShadowAtom(mce, vatomX));
2330    tl_assert(isShadowAtom(mce, vatomY));
2331    at = mkUifUV256(mce, vatomX, vatomY);
2332    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2333    return at;
2334 }
2335 
2336 static
2337 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2338 {
2339    IRAtom* at;
2340    tl_assert(isShadowAtom(mce, vatomX));
2341    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2342    return at;
2343 }
2344 
2345 /* --- ... and ... 32Fx8 versions of the same ... --- */
2346 
2347 static
2348 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2349 {
2350    IRAtom* at;
2351    tl_assert(isShadowAtom(mce, vatomX));
2352    tl_assert(isShadowAtom(mce, vatomY));
2353    at = mkUifUV256(mce, vatomX, vatomY);
2354    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2355    return at;
2356 }
2357 
2358 static
2359 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2360 {
2361    IRAtom* at;
2362    tl_assert(isShadowAtom(mce, vatomX));
2363    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2364    return at;
2365 }
2366 
2367 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2368 
2369 static
2370 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2371                                        IRAtom* vatomX, IRAtom* vatomY )
2372 {
2373    /* This is the same as binary64Fx2, except that we subsequently
2374       pessimise vRM (definedness of the rounding mode), widen to 128
2375       bits and UifU it into the result.  As with the scalar cases, if
2376       the RM is a constant then it is defined and so this extra bit
2377       will get constant-folded out later. */
2378    // "do" the vector args
2379    IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2380    // PCast the RM, and widen it to 128 bits
2381    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2382    // Roll it into the result
2383    t1 = mkUifUV128(mce, t1, t2);
2384    return t1;
2385 }
2386 
2387 /* --- ... and ... 32Fx4 versions of the same --- */
2388 
2389 static
2390 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2391                                        IRAtom* vatomX, IRAtom* vatomY )
2392 {
2393    IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2394    // PCast the RM, and widen it to 128 bits
2395    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2396    // Roll it into the result
2397    t1 = mkUifUV128(mce, t1, t2);
2398    return t1;
2399 }
2400 
2401 /* --- ... and ... 64Fx4 versions of the same --- */
2402 
2403 static
2404 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2405                                        IRAtom* vatomX, IRAtom* vatomY )
2406 {
2407    IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2408    // PCast the RM, and widen it to 256 bits
2409    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2410    // Roll it into the result
2411    t1 = mkUifUV256(mce, t1, t2);
2412    return t1;
2413 }
2414 
2415 /* --- ... and ... 32Fx8 versions of the same --- */
2416 
2417 static
2418 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2419                                        IRAtom* vatomX, IRAtom* vatomY )
2420 {
2421    IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2422    // PCast the RM, and widen it to 256 bits
2423    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2424    // Roll it into the result
2425    t1 = mkUifUV256(mce, t1, t2);
2426    return t1;
2427 }
2428 
2429 /* --- 64Fx2 unary FP ops, with rounding mode --- */
2430 
2431 static
2432 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2433 {
2434    /* Same scheme as binary64Fx2_w_rm. */
2435    // "do" the vector arg
2436    IRAtom* t1 = unary64Fx2(mce, vatomX);
2437    // PCast the RM, and widen it to 128 bits
2438    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2439    // Roll it into the result
2440    t1 = mkUifUV128(mce, t1, t2);
2441    return t1;
2442 }
2443 
2444 /* --- ... and ... 32Fx4 versions of the same --- */
2445 
2446 static
2447 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2448 {
2449    /* Same scheme as unary64Fx2_w_rm. */
2450    IRAtom* t1 = unary32Fx4(mce, vatomX);
2451    // PCast the RM, and widen it to 128 bits
2452    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2453    // Roll it into the result
2454    t1 = mkUifUV128(mce, t1, t2);
2455    return t1;
2456 }
2457 
2458 
2459 /* --- --- Vector saturated narrowing --- --- */
2460 
2461 /* We used to do something very clever here, but on closer inspection
2462    (2011-Jun-15), and in particular bug #279698, it turns out to be
2463    wrong.  Part of the problem came from the fact that for a long
2464    time, the IR primops to do with saturated narrowing were
2465    underspecified and managed to confuse multiple cases which needed
2466    to be separate: the op names had a signedness qualifier, but in
2467    fact the source and destination signednesses needed to be specified
2468    independently, so the op names really need two independent
2469    signedness specifiers.
2470 
2471    As of 2011-Jun-15 (ish) the underspecification was sorted out
2472    properly.  The incorrect instrumentation remained, though.  That
2473    has now (2011-Oct-22) been fixed.
2474 
2475    What we now do is simple:
2476 
2477    Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
2478    number of lanes, X is the source lane width and signedness, and Y
2479    is the destination lane width and signedness.  In all cases the
2480    destination lane width is half the source lane width, so the names
2481    have a bit of redundancy, but are at least easy to read.
2482 
2483    For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
2484    to unsigned 16s.
2485 
2486    Let Vanilla(OP) be a function that takes OP, one of these
2487    saturating narrowing ops, and produces the same "shaped" narrowing
2488    op which is not saturating, but merely dumps the most significant
2489    bits.  "same shape" means that the lane numbers and widths are the
2490    same as with OP.
2491 
2492    For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
2493                   = Iop_NarrowBin32to16x8,
2494    that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
2495    dumping the top half of each lane.
2496 
2497    So, with that in place, the scheme is simple, and it is simple to
2498    pessimise each lane individually and then apply Vanilla(OP) so as
2499    to get the result in the right "shape".  If the original OP is
2500    QNarrowBinXtoYxZ then we produce
2501 
2502    Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
2503 
2504    or for the case when OP is unary (Iop_QNarrowUn*)
2505 
2506    Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
2507 */
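/* As a concrete instance of the scheme just described (using only ops
   already named above): for Iop_QNarrowBin32Sto16Sx8 applied to (x,y),
   the V bits of the result are computed as

      Iop_NarrowBin32to16x8( PCast32x4(x#), PCast32x4(y#) )

   -- each 32-bit source lane is first smeared to all-0s or all-1s, and
   only then narrowed with the non-saturating op of the same shape.
   This is what vectorNarrowBinV128 below generates. */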
2508 static
2509 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
2510 {
2511    switch (qnarrowOp) {
2512       /* Binary: (128, 128) -> 128 */
2513       case Iop_QNarrowBin16Sto8Ux16:
2514       case Iop_QNarrowBin16Sto8Sx16:
2515       case Iop_QNarrowBin16Uto8Ux16:
2516       case Iop_QNarrowBin64Sto32Sx4:
2517       case Iop_QNarrowBin64Uto32Ux4:
2518          return Iop_NarrowBin16to8x16;
2519       case Iop_QNarrowBin32Sto16Ux8:
2520       case Iop_QNarrowBin32Sto16Sx8:
2521       case Iop_QNarrowBin32Uto16Ux8:
2522          return Iop_NarrowBin32to16x8;
2523       /* Binary: (64, 64) -> 64 */
2524       case Iop_QNarrowBin32Sto16Sx4:
2525          return Iop_NarrowBin32to16x4;
2526       case Iop_QNarrowBin16Sto8Ux8:
2527       case Iop_QNarrowBin16Sto8Sx8:
2528          return Iop_NarrowBin16to8x8;
2529       /* Unary: 128 -> 64 */
2530       case Iop_QNarrowUn64Uto32Ux2:
2531       case Iop_QNarrowUn64Sto32Sx2:
2532       case Iop_QNarrowUn64Sto32Ux2:
2533          return Iop_NarrowUn64to32x2;
2534       case Iop_QNarrowUn32Uto16Ux4:
2535       case Iop_QNarrowUn32Sto16Sx4:
2536       case Iop_QNarrowUn32Sto16Ux4:
2537       case Iop_F32toF16x4:
2538          return Iop_NarrowUn32to16x4;
2539       case Iop_QNarrowUn16Uto8Ux8:
2540       case Iop_QNarrowUn16Sto8Sx8:
2541       case Iop_QNarrowUn16Sto8Ux8:
2542          return Iop_NarrowUn16to8x8;
2543       default:
2544          ppIROp(qnarrowOp);
2545          VG_(tool_panic)("vanillaNarrowOpOfShape");
2546    }
2547 }
2548 
2549 static
2550 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
2551                               IRAtom* vatom1, IRAtom* vatom2)
2552 {
2553    IRAtom *at1, *at2, *at3;
2554    IRAtom* (*pcast)( MCEnv*, IRAtom* );
2555    switch (narrow_op) {
2556       case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
2557       case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
2558       case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
2559       case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
2560       case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
2561       case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
2562       case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
2563       case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
2564       default: VG_(tool_panic)("vectorNarrowBinV128");
2565    }
2566    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2567    tl_assert(isShadowAtom(mce,vatom1));
2568    tl_assert(isShadowAtom(mce,vatom2));
2569    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2570    at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
2571    at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
2572    return at3;
2573 }
2574 
2575 static
2576 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
2577                             IRAtom* vatom1, IRAtom* vatom2)
2578 {
2579    IRAtom *at1, *at2, *at3;
2580    IRAtom* (*pcast)( MCEnv*, IRAtom* );
2581    switch (narrow_op) {
2582       case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
2583       case Iop_QNarrowBin16Sto8Sx8:  pcast = mkPCast16x4; break;
2584       case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
2585       default: VG_(tool_panic)("vectorNarrowBin64");
2586    }
2587    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2588    tl_assert(isShadowAtom(mce,vatom1));
2589    tl_assert(isShadowAtom(mce,vatom2));
2590    at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
2591    at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
2592    at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
2593    return at3;
2594 }
2595 
2596 static
2597 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
2598                              IRAtom* vatom1)
2599 {
2600    IRAtom *at1, *at2;
2601    IRAtom* (*pcast)( MCEnv*, IRAtom* );
2602    tl_assert(isShadowAtom(mce,vatom1));
2603    /* For vanilla narrowing (non-saturating), we can just apply
2604       the op directly to the V bits. */
2605    switch (narrow_op) {
2606       case Iop_NarrowUn16to8x8:
2607       case Iop_NarrowUn32to16x4:
2608       case Iop_NarrowUn64to32x2:
2609       case Iop_F32toF16x4:
2610          at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
2611          return at1;
2612       default:
2613          break; /* Do Plan B */
2614    }
2615    /* Plan B: for ops that involve a saturation operation on the args,
2616       we must PCast before the vanilla narrow. */
2617    switch (narrow_op) {
2618       case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
2619       case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
2620       case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
2621       case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
2622       case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
2623       case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
2624       case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
2625       case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
2626       case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
2627       default: VG_(tool_panic)("vectorNarrowUnV128");
2628    }
2629    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2630    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2631    at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
2632    return at2;
2633 }
2634 
2635 static
2636 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
2637                          IRAtom* vatom1)
2638 {
2639    IRAtom *at1, *at2;
2640    IRAtom* (*pcast)( MCEnv*, IRAtom* );
2641    switch (longen_op) {
2642       case Iop_Widen8Uto16x8:  pcast = mkPCast16x8; break;
2643       case Iop_Widen8Sto16x8:  pcast = mkPCast16x8; break;
2644       case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
2645       case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
2646       case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
2647       case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
2648       case Iop_F16toF32x4:     pcast = mkPCast32x4; break;
2649       default: VG_(tool_panic)("vectorWidenI64");
2650    }
2651    tl_assert(isShadowAtom(mce,vatom1));
2652    at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
2653    at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
2654    return at2;
2655 }
2656 
2657 
2658 /* --- --- Vector integer arithmetic --- --- */
2659 
2660 /* Simple ... UifU the args and per-lane pessimise the results. */
2661 
2662 /* --- V256-bit versions --- */
2663 
2664 static
2665 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2666 {
2667    IRAtom* at;
2668    at = mkUifUV256(mce, vatom1, vatom2);
2669    at = mkPCast8x32(mce, at);
2670    return at;
2671 }
2672 
2673 static
2674 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2675 {
2676    IRAtom* at;
2677    at = mkUifUV256(mce, vatom1, vatom2);
2678    at = mkPCast16x16(mce, at);
2679    return at;
2680 }
2681 
2682 static
2683 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2684 {
2685    IRAtom* at;
2686    at = mkUifUV256(mce, vatom1, vatom2);
2687    at = mkPCast32x8(mce, at);
2688    return at;
2689 }
2690 
2691 static
2692 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2693 {
2694    IRAtom* at;
2695    at = mkUifUV256(mce, vatom1, vatom2);
2696    at = mkPCast64x4(mce, at);
2697    return at;
2698 }
2699 
2700 /* --- V128-bit versions --- */
2701 
2702 static
2703 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2704 {
2705    IRAtom* at;
2706    at = mkUifUV128(mce, vatom1, vatom2);
2707    at = mkPCast8x16(mce, at);
2708    return at;
2709 }
2710 
2711 static
2712 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2713 {
2714    IRAtom* at;
2715    at = mkUifUV128(mce, vatom1, vatom2);
2716    at = mkPCast16x8(mce, at);
2717    return at;
2718 }
2719 
2720 static
2721 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2722 {
2723    IRAtom* at;
2724    at = mkUifUV128(mce, vatom1, vatom2);
2725    at = mkPCast32x4(mce, at);
2726    return at;
2727 }
2728 
2729 static
2730 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2731 {
2732    IRAtom* at;
2733    at = mkUifUV128(mce, vatom1, vatom2);
2734    at = mkPCast64x2(mce, at);
2735    return at;
2736 }
2737 
2738 /* --- 64-bit versions --- */
2739 
2740 static
2741 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2742 {
2743    IRAtom* at;
2744    at = mkUifU64(mce, vatom1, vatom2);
2745    at = mkPCast8x8(mce, at);
2746    return at;
2747 }
2748 
2749 static
2750 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2751 {
2752    IRAtom* at;
2753    at = mkUifU64(mce, vatom1, vatom2);
2754    at = mkPCast16x4(mce, at);
2755    return at;
2756 }
2757 
2758 static
2759 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2760 {
2761    IRAtom* at;
2762    at = mkUifU64(mce, vatom1, vatom2);
2763    at = mkPCast32x2(mce, at);
2764    return at;
2765 }
2766 
2767 static
2768 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2769 {
2770    IRAtom* at;
2771    at = mkUifU64(mce, vatom1, vatom2);
2772    at = mkPCastTo(mce, Ity_I64, at);
2773    return at;
2774 }
2775 
2776 /* --- 32-bit versions --- */
2777 
2778 static
2779 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2780 {
2781    IRAtom* at;
2782    at = mkUifU32(mce, vatom1, vatom2);
2783    at = mkPCast8x4(mce, at);
2784    return at;
2785 }
2786 
2787 static
2788 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2789 {
2790    IRAtom* at;
2791    at = mkUifU32(mce, vatom1, vatom2);
2792    at = mkPCast16x2(mce, at);
2793    return at;
2794 }
2795 
2796 
2797 /*------------------------------------------------------------*/
2798 /*--- Generate shadow values from all kinds of IRExprs.    ---*/
2799 /*------------------------------------------------------------*/
2800 
2801 static
2802 IRAtom* expr2vbits_Qop ( MCEnv* mce,
2803                          IROp op,
2804                          IRAtom* atom1, IRAtom* atom2,
2805                          IRAtom* atom3, IRAtom* atom4 )
2806 {
2807    IRAtom* vatom1 = expr2vbits( mce, atom1 );
2808    IRAtom* vatom2 = expr2vbits( mce, atom2 );
2809    IRAtom* vatom3 = expr2vbits( mce, atom3 );
2810    IRAtom* vatom4 = expr2vbits( mce, atom4 );
2811 
2812    tl_assert(isOriginalAtom(mce,atom1));
2813    tl_assert(isOriginalAtom(mce,atom2));
2814    tl_assert(isOriginalAtom(mce,atom3));
2815    tl_assert(isOriginalAtom(mce,atom4));
2816    tl_assert(isShadowAtom(mce,vatom1));
2817    tl_assert(isShadowAtom(mce,vatom2));
2818    tl_assert(isShadowAtom(mce,vatom3));
2819    tl_assert(isShadowAtom(mce,vatom4));
2820    tl_assert(sameKindedAtoms(atom1,vatom1));
2821    tl_assert(sameKindedAtoms(atom2,vatom2));
2822    tl_assert(sameKindedAtoms(atom3,vatom3));
2823    tl_assert(sameKindedAtoms(atom4,vatom4));
2824    switch (op) {
2825       case Iop_MAddF64:
2826       case Iop_MAddF64r32:
2827       case Iop_MSubF64:
2828       case Iop_MSubF64r32:
2829          /* I32(rm) x F64 x F64 x F64 -> F64 */
2830          return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
2831 
2832       case Iop_MAddF32:
2833       case Iop_MSubF32:
2834          /* I32(rm) x F32 x F32 x F32 -> F32 */
2835          return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
2836 
2837       case Iop_MAddF128:
2838       case Iop_MSubF128:
2839       case Iop_NegMAddF128:
2840       case Iop_NegMSubF128:
2841          /* I32(rm) x F128 x F128 x F128 -> F128 */
2842          return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
2843 
2844       /* V256-bit data-steering */
2845       case Iop_64x4toV256:
2846          return assignNew('V', mce, Ity_V256,
2847                           IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
2848 
2849       default:
2850          ppIROp(op);
2851          VG_(tool_panic)("memcheck:expr2vbits_Qop");
2852    }
2853 }
2854 
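/* expr2vbits_Qop above, and the Triop/Binop cases below, defer most
   floating-point ops to the mkLazyN helpers.  Assuming those helpers
   follow the usual Memcheck scheme (PCast each argument's V bits to
   the result type, then UifU the results together), the outcome is
   simply: entirely undefined if any argument contains any undefined
   bit, otherwise entirely defined.  A hypothetical standalone model of
   that rule for two 64-bit arguments, for illustration only: */
#if 0
static ULong model_lazy2_64 ( ULong vbitsA, ULong vbitsB )
{
   /* Any undefined bit anywhere poisons the whole result. */
   return (vbitsA != 0 || vbitsB != 0) ? ~0ULL : 0ULL;
}
#endif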
2855 
2856 static
2857 IRAtom* expr2vbits_Triop ( MCEnv* mce,
2858                            IROp op,
2859                            IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
2860 {
2861    IRAtom* vatom1 = expr2vbits( mce, atom1 );
2862    IRAtom* vatom2 = expr2vbits( mce, atom2 );
2863    IRAtom* vatom3 = expr2vbits( mce, atom3 );
2864 
2865    tl_assert(isOriginalAtom(mce,atom1));
2866    tl_assert(isOriginalAtom(mce,atom2));
2867    tl_assert(isOriginalAtom(mce,atom3));
2868    tl_assert(isShadowAtom(mce,vatom1));
2869    tl_assert(isShadowAtom(mce,vatom2));
2870    tl_assert(isShadowAtom(mce,vatom3));
2871    tl_assert(sameKindedAtoms(atom1,vatom1));
2872    tl_assert(sameKindedAtoms(atom2,vatom2));
2873    tl_assert(sameKindedAtoms(atom3,vatom3));
2874    switch (op) {
2875       case Iop_AddF128:
2876       case Iop_SubF128:
2877       case Iop_MulF128:
2878       case Iop_DivF128:
2879       case Iop_AddD128:
2880       case Iop_SubD128:
2881       case Iop_MulD128:
2882       case Iop_DivD128:
2883       case Iop_QuantizeD128:
2884          /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
2885          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
2886       case Iop_AddF64:
2887       case Iop_AddD64:
2888       case Iop_AddF64r32:
2889       case Iop_SubF64:
2890       case Iop_SubD64:
2891       case Iop_SubF64r32:
2892       case Iop_MulF64:
2893       case Iop_MulD64:
2894       case Iop_MulF64r32:
2895       case Iop_DivF64:
2896       case Iop_DivD64:
2897       case Iop_DivF64r32:
2898       case Iop_ScaleF64:
2899       case Iop_Yl2xF64:
2900       case Iop_Yl2xp1F64:
2901       case Iop_AtanF64:
2902       case Iop_PRemF64:
2903       case Iop_PRem1F64:
2904       case Iop_QuantizeD64:
2905          /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
2906          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
2907       case Iop_PRemC3210F64:
2908       case Iop_PRem1C3210F64:
2909          /* I32(rm) x F64 x F64 -> I32 */
2910          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
2911       case Iop_AddF32:
2912       case Iop_SubF32:
2913       case Iop_MulF32:
2914       case Iop_DivF32:
2915          /* I32(rm) x F32 x F32 -> I32 */
2916          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
2917       case Iop_SignificanceRoundD64:
2918          /* IRRoundingMode(I32) x I8 x D64 -> D64 */
2919          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
2920       case Iop_SignificanceRoundD128:
2921          /* IRRoundingMode(I32) x I8 x D128 -> D128 */
2922          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
2923       case Iop_SliceV128:
2924          /* (V128, V128, I8) -> V128 */
2925          complainIfUndefined(mce, atom3, NULL);
2926          return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
2927       case Iop_Slice64:
2928          /* (I64, I64, I8) -> I64 */
2929          complainIfUndefined(mce, atom3, NULL);
2930          return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
2931       case Iop_SetElem8x8:
2932       case Iop_SetElem16x4:
2933       case Iop_SetElem32x2:
2934          complainIfUndefined(mce, atom2, NULL);
2935          return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
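      /* Note the pattern shared by SliceV128, Slice64 and the SetElem
         cases above: the index/shift-amount argument is checked eagerly
         with complainIfUndefined, and the original atom (not its shadow)
         is then fed into the shadow operation, i.e. it is treated as
         defined from that point on.  The data arguments, by contrast,
         have their V bits steered through the same operation. */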
2936 
2937       /* Vector FP with rounding mode as the first arg */
2938       case Iop_Add64Fx2:
2939       case Iop_Sub64Fx2:
2940       case Iop_Mul64Fx2:
2941       case Iop_Div64Fx2:
2942          return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
2943 
2944       case Iop_Add32Fx4:
2945       case Iop_Sub32Fx4:
2946       case Iop_Mul32Fx4:
2947       case Iop_Div32Fx4:
2948         return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
2949 
2950       case Iop_Add64Fx4:
2951       case Iop_Sub64Fx4:
2952       case Iop_Mul64Fx4:
2953       case Iop_Div64Fx4:
2954          return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
2955 
2956       case Iop_Add32Fx8:
2957       case Iop_Sub32Fx8:
2958       case Iop_Mul32Fx8:
2959       case Iop_Div32Fx8:
2960          return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
2961 
2962       default:
2963          ppIROp(op);
2964          VG_(tool_panic)("memcheck:expr2vbits_Triop");
2965    }
2966 }
2967 
2968 
2969 static
2970 IRAtom* expr2vbits_Binop ( MCEnv* mce,
2971                            IROp op,
2972                            IRAtom* atom1, IRAtom* atom2 )
2973 {
2974    IRType  and_or_ty;
2975    IRAtom* (*uifu)    (MCEnv*, IRAtom*, IRAtom*);
2976    IRAtom* (*difd)    (MCEnv*, IRAtom*, IRAtom*);
2977    IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*);
2978 
2979    IRAtom* vatom1 = expr2vbits( mce, atom1 );
2980    IRAtom* vatom2 = expr2vbits( mce, atom2 );
2981 
2982    tl_assert(isOriginalAtom(mce,atom1));
2983    tl_assert(isOriginalAtom(mce,atom2));
2984    tl_assert(isShadowAtom(mce,vatom1));
2985    tl_assert(isShadowAtom(mce,vatom2));
2986    tl_assert(sameKindedAtoms(atom1,vatom1));
2987    tl_assert(sameKindedAtoms(atom2,vatom2));
2988    switch (op) {
2989 
2990       /* 32-bit SIMD */
2991 
2992       case Iop_Add16x2:
2993       case Iop_HAdd16Ux2:
2994       case Iop_HAdd16Sx2:
2995       case Iop_Sub16x2:
2996       case Iop_HSub16Ux2:
2997       case Iop_HSub16Sx2:
2998       case Iop_QAdd16Sx2:
2999       case Iop_QSub16Sx2:
3000       case Iop_QSub16Ux2:
3001       case Iop_QAdd16Ux2:
3002          return binary16Ix2(mce, vatom1, vatom2);
3003 
3004       case Iop_Add8x4:
3005       case Iop_HAdd8Ux4:
3006       case Iop_HAdd8Sx4:
3007       case Iop_Sub8x4:
3008       case Iop_HSub8Ux4:
3009       case Iop_HSub8Sx4:
3010       case Iop_QSub8Ux4:
3011       case Iop_QAdd8Ux4:
3012       case Iop_QSub8Sx4:
3013       case Iop_QAdd8Sx4:
3014          return binary8Ix4(mce, vatom1, vatom2);
3015 
3016       /* 64-bit SIMD */
3017 
3018       case Iop_ShrN8x8:
3019       case Iop_ShrN16x4:
3020       case Iop_ShrN32x2:
3021       case Iop_SarN8x8:
3022       case Iop_SarN16x4:
3023       case Iop_SarN32x2:
3024       case Iop_ShlN16x4:
3025       case Iop_ShlN32x2:
3026       case Iop_ShlN8x8:
3027          /* Same scheme as with all other shifts. */
3028          complainIfUndefined(mce, atom2, NULL);
3029          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3030 
3031       case Iop_QNarrowBin32Sto16Sx4:
3032       case Iop_QNarrowBin16Sto8Sx8:
3033       case Iop_QNarrowBin16Sto8Ux8:
3034          return vectorNarrowBin64(mce, op, vatom1, vatom2);
3035 
3036       case Iop_Min8Ux8:
3037       case Iop_Min8Sx8:
3038       case Iop_Max8Ux8:
3039       case Iop_Max8Sx8:
3040       case Iop_Avg8Ux8:
3041       case Iop_QSub8Sx8:
3042       case Iop_QSub8Ux8:
3043       case Iop_Sub8x8:
3044       case Iop_CmpGT8Sx8:
3045       case Iop_CmpGT8Ux8:
3046       case Iop_CmpEQ8x8:
3047       case Iop_QAdd8Sx8:
3048       case Iop_QAdd8Ux8:
3049       case Iop_QSal8x8:
3050       case Iop_QShl8x8:
3051       case Iop_Add8x8:
3052       case Iop_Mul8x8:
3053       case Iop_PolynomialMul8x8:
3054          return binary8Ix8(mce, vatom1, vatom2);
3055 
3056       case Iop_Min16Sx4:
3057       case Iop_Min16Ux4:
3058       case Iop_Max16Sx4:
3059       case Iop_Max16Ux4:
3060       case Iop_Avg16Ux4:
3061       case Iop_QSub16Ux4:
3062       case Iop_QSub16Sx4:
3063       case Iop_Sub16x4:
3064       case Iop_Mul16x4:
3065       case Iop_MulHi16Sx4:
3066       case Iop_MulHi16Ux4:
3067       case Iop_CmpGT16Sx4:
3068       case Iop_CmpGT16Ux4:
3069       case Iop_CmpEQ16x4:
3070       case Iop_QAdd16Sx4:
3071       case Iop_QAdd16Ux4:
3072       case Iop_QSal16x4:
3073       case Iop_QShl16x4:
3074       case Iop_Add16x4:
3075       case Iop_QDMulHi16Sx4:
3076       case Iop_QRDMulHi16Sx4:
3077          return binary16Ix4(mce, vatom1, vatom2);
3078 
3079       case Iop_Sub32x2:
3080       case Iop_Mul32x2:
3081       case Iop_Max32Sx2:
3082       case Iop_Max32Ux2:
3083       case Iop_Min32Sx2:
3084       case Iop_Min32Ux2:
3085       case Iop_CmpGT32Sx2:
3086       case Iop_CmpGT32Ux2:
3087       case Iop_CmpEQ32x2:
3088       case Iop_Add32x2:
3089       case Iop_QAdd32Ux2:
3090       case Iop_QAdd32Sx2:
3091       case Iop_QSub32Ux2:
3092       case Iop_QSub32Sx2:
3093       case Iop_QSal32x2:
3094       case Iop_QShl32x2:
3095       case Iop_QDMulHi32Sx2:
3096       case Iop_QRDMulHi32Sx2:
3097          return binary32Ix2(mce, vatom1, vatom2);
3098 
3099       case Iop_QSub64Ux1:
3100       case Iop_QSub64Sx1:
3101       case Iop_QAdd64Ux1:
3102       case Iop_QAdd64Sx1:
3103       case Iop_QSal64x1:
3104       case Iop_QShl64x1:
3105       case Iop_Sal64x1:
3106          return binary64Ix1(mce, vatom1, vatom2);
3107 
3108       case Iop_QShlNsatSU8x8:
3109       case Iop_QShlNsatUU8x8:
3110       case Iop_QShlNsatSS8x8:
3111          complainIfUndefined(mce, atom2, NULL);
3112          return mkPCast8x8(mce, vatom1);
3113 
3114       case Iop_QShlNsatSU16x4:
3115       case Iop_QShlNsatUU16x4:
3116       case Iop_QShlNsatSS16x4:
3117          complainIfUndefined(mce, atom2, NULL);
3118          return mkPCast16x4(mce, vatom1);
3119 
3120       case Iop_QShlNsatSU32x2:
3121       case Iop_QShlNsatUU32x2:
3122       case Iop_QShlNsatSS32x2:
3123          complainIfUndefined(mce, atom2, NULL);
3124          return mkPCast32x2(mce, vatom1);
3125 
3126       case Iop_QShlNsatSU64x1:
3127       case Iop_QShlNsatUU64x1:
3128       case Iop_QShlNsatSS64x1:
3129          complainIfUndefined(mce, atom2, NULL);
3130          return mkPCast32x2(mce, vatom1);
3131 
3132       case Iop_PwMax32Sx2:
3133       case Iop_PwMax32Ux2:
3134       case Iop_PwMin32Sx2:
3135       case Iop_PwMin32Ux2:
3136       case Iop_PwMax32Fx2:
3137       case Iop_PwMin32Fx2:
3138          return assignNew('V', mce, Ity_I64,
3139                           binop(Iop_PwMax32Ux2,
3140                                 mkPCast32x2(mce, vatom1),
3141                                 mkPCast32x2(mce, vatom2)));
3142 
3143       case Iop_PwMax16Sx4:
3144       case Iop_PwMax16Ux4:
3145       case Iop_PwMin16Sx4:
3146       case Iop_PwMin16Ux4:
3147          return assignNew('V', mce, Ity_I64,
3148                           binop(Iop_PwMax16Ux4,
3149                                 mkPCast16x4(mce, vatom1),
3150                                 mkPCast16x4(mce, vatom2)));
3151 
3152       case Iop_PwMax8Sx8:
3153       case Iop_PwMax8Ux8:
3154       case Iop_PwMin8Sx8:
3155       case Iop_PwMin8Ux8:
3156          return assignNew('V', mce, Ity_I64,
3157                           binop(Iop_PwMax8Ux8,
3158                                 mkPCast8x8(mce, vatom1),
3159                                 mkPCast8x8(mce, vatom2)));
3160 
3161       case Iop_PwAdd32x2:
3162       case Iop_PwAdd32Fx2:
3163          return mkPCast32x2(mce,
3164                assignNew('V', mce, Ity_I64,
3165                          binop(Iop_PwAdd32x2,
3166                                mkPCast32x2(mce, vatom1),
3167                                mkPCast32x2(mce, vatom2))));
3168 
3169       case Iop_PwAdd16x4:
3170          return mkPCast16x4(mce,
3171                assignNew('V', mce, Ity_I64,
3172                          binop(op, mkPCast16x4(mce, vatom1),
3173                                    mkPCast16x4(mce, vatom2))));
3174 
3175       case Iop_PwAdd8x8:
3176          return mkPCast8x8(mce,
3177                assignNew('V', mce, Ity_I64,
3178                          binop(op, mkPCast8x8(mce, vatom1),
3179                                    mkPCast8x8(mce, vatom2))));
3180 
3181       case Iop_Shl8x8:
3182       case Iop_Shr8x8:
3183       case Iop_Sar8x8:
3184       case Iop_Sal8x8:
3185          return mkUifU64(mce,
3186                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3187                    mkPCast8x8(mce,vatom2)
3188                 );
3189 
3190       case Iop_Shl16x4:
3191       case Iop_Shr16x4:
3192       case Iop_Sar16x4:
3193       case Iop_Sal16x4:
3194          return mkUifU64(mce,
3195                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3196                    mkPCast16x4(mce,vatom2)
3197                 );
3198 
3199       case Iop_Shl32x2:
3200       case Iop_Shr32x2:
3201       case Iop_Sar32x2:
3202       case Iop_Sal32x2:
3203          return mkUifU64(mce,
3204                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3205                    mkPCast32x2(mce,vatom2)
3206                 );
3207 
3208       /* 64-bit data-steering */
3209       case Iop_InterleaveLO32x2:
3210       case Iop_InterleaveLO16x4:
3211       case Iop_InterleaveLO8x8:
3212       case Iop_InterleaveHI32x2:
3213       case Iop_InterleaveHI16x4:
3214       case Iop_InterleaveHI8x8:
3215       case Iop_CatOddLanes8x8:
3216       case Iop_CatEvenLanes8x8:
3217       case Iop_CatOddLanes16x4:
3218       case Iop_CatEvenLanes16x4:
3219       case Iop_InterleaveOddLanes8x8:
3220       case Iop_InterleaveEvenLanes8x8:
3221       case Iop_InterleaveOddLanes16x4:
3222       case Iop_InterleaveEvenLanes16x4:
3223          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3224 
3225       case Iop_GetElem8x8:
3226          complainIfUndefined(mce, atom2, NULL);
3227          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3228       case Iop_GetElem16x4:
3229          complainIfUndefined(mce, atom2, NULL);
3230          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3231       case Iop_GetElem32x2:
3232          complainIfUndefined(mce, atom2, NULL);
3233          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3234 
3235       /* Perm8x8: rearrange values in left arg using steering values
3236         from right arg.  So rearrange the vbits in the same way but
3237         pessimise wrt steering values. */
3238       case Iop_Perm8x8:
3239          return mkUifU64(
3240                    mce,
3241                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3242                    mkPCast8x8(mce, vatom2)
3243                 );
3244 
3245       /* V128-bit SIMD */
3246 
3247       case Iop_Sqrt32Fx4:
3248          return unary32Fx4_w_rm(mce, vatom1, vatom2);
3249       case Iop_Sqrt64Fx2:
3250          return unary64Fx2_w_rm(mce, vatom1, vatom2);
3251 
3252       case Iop_ShrN8x16:
3253       case Iop_ShrN16x8:
3254       case Iop_ShrN32x4:
3255       case Iop_ShrN64x2:
3256       case Iop_SarN8x16:
3257       case Iop_SarN16x8:
3258       case Iop_SarN32x4:
3259       case Iop_SarN64x2:
3260       case Iop_ShlN8x16:
3261       case Iop_ShlN16x8:
3262       case Iop_ShlN32x4:
3263       case Iop_ShlN64x2:
3264          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
3265             this is wrong now, scalar shifts are done properly lazily.
3266             Vector shifts should be fixed too. */
3267          complainIfUndefined(mce, atom2, NULL);
3268          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3269 
3270       /* V x V shifts/rotates are done using the standard lazy scheme. */
3271       /* For the non-rounding variants of bi-di vector x vector
3272          shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3273          But note that this is overly pessimistic, because in fact only
3274          the bottom 8 bits of each lane of the second argument are taken
3275          into account when shifting.  So really we ought to ignore
3276          undefinedness in bits 8 and above of each lane in the
3277          second argument. */
3278       case Iop_Shl8x16:
3279       case Iop_Shr8x16:
3280       case Iop_Sar8x16:
3281       case Iop_Sal8x16:
3282       case Iop_Rol8x16:
3283       case Iop_Sh8Sx16:
3284       case Iop_Sh8Ux16:
3285          return mkUifUV128(mce,
3286                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3287                    mkPCast8x16(mce,vatom2)
3288                 );
3289 
3290       case Iop_Shl16x8:
3291       case Iop_Shr16x8:
3292       case Iop_Sar16x8:
3293       case Iop_Sal16x8:
3294       case Iop_Rol16x8:
3295       case Iop_Sh16Sx8:
3296       case Iop_Sh16Ux8:
3297          return mkUifUV128(mce,
3298                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3299                    mkPCast16x8(mce,vatom2)
3300                 );
3301 
3302       case Iop_Shl32x4:
3303       case Iop_Shr32x4:
3304       case Iop_Sar32x4:
3305       case Iop_Sal32x4:
3306       case Iop_Rol32x4:
3307       case Iop_Sh32Sx4:
3308       case Iop_Sh32Ux4:
3309          return mkUifUV128(mce,
3310                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3311                    mkPCast32x4(mce,vatom2)
3312                 );
3313 
3314       case Iop_Shl64x2:
3315       case Iop_Shr64x2:
3316       case Iop_Sar64x2:
3317       case Iop_Sal64x2:
3318       case Iop_Rol64x2:
3319       case Iop_Sh64Sx2:
3320       case Iop_Sh64Ux2:
3321          return mkUifUV128(mce,
3322                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3323                    mkPCast64x2(mce,vatom2)
3324                 );
3325 
3326       /* For the rounding variants of bi-di vector x vector shifts, the
3327          rounding adjustment can cause undefinedness to propagate through
3328          the entire lane, in the worst case.  Too complex to handle
3329          properly .. just UifU the arguments and then PCast them.
3330          Suboptimal but safe. */
3331       case Iop_Rsh8Sx16:
3332       case Iop_Rsh8Ux16:
3333          return binary8Ix16(mce, vatom1, vatom2);
3334       case Iop_Rsh16Sx8:
3335       case Iop_Rsh16Ux8:
3336          return binary16Ix8(mce, vatom1, vatom2);
3337       case Iop_Rsh32Sx4:
3338       case Iop_Rsh32Ux4:
3339          return binary32Ix4(mce, vatom1, vatom2);
3340       case Iop_Rsh64Sx2:
3341       case Iop_Rsh64Ux2:
3342          return binary64Ix2(mce, vatom1, vatom2);
3343 
3344       case Iop_F32ToFixed32Ux4_RZ:
3345       case Iop_F32ToFixed32Sx4_RZ:
3346       case Iop_Fixed32UToF32x4_RN:
3347       case Iop_Fixed32SToF32x4_RN:
3348          complainIfUndefined(mce, atom2, NULL);
3349          return mkPCast32x4(mce, vatom1);
3350 
3351       case Iop_F32ToFixed32Ux2_RZ:
3352       case Iop_F32ToFixed32Sx2_RZ:
3353       case Iop_Fixed32UToF32x2_RN:
3354       case Iop_Fixed32SToF32x2_RN:
3355          complainIfUndefined(mce, atom2, NULL);
3356          return mkPCast32x2(mce, vatom1);
3357 
3358       case Iop_QSub8Ux16:
3359       case Iop_QSub8Sx16:
3360       case Iop_Sub8x16:
3361       case Iop_Min8Ux16:
3362       case Iop_Min8Sx16:
3363       case Iop_Max8Ux16:
3364       case Iop_Max8Sx16:
3365       case Iop_CmpGT8Sx16:
3366       case Iop_CmpGT8Ux16:
3367       case Iop_CmpEQ8x16:
3368       case Iop_Avg8Ux16:
3369       case Iop_Avg8Sx16:
3370       case Iop_QAdd8Ux16:
3371       case Iop_QAdd8Sx16:
3372       case Iop_QAddExtUSsatSS8x16:
3373       case Iop_QAddExtSUsatUU8x16:
3374       case Iop_QSal8x16:
3375       case Iop_QShl8x16:
3376       case Iop_Add8x16:
3377       case Iop_Mul8x16:
3378       case Iop_PolynomialMul8x16:
3379       case Iop_PolynomialMulAdd8x16:
3380          return binary8Ix16(mce, vatom1, vatom2);
3381 
3382       case Iop_QSub16Ux8:
3383       case Iop_QSub16Sx8:
3384       case Iop_Sub16x8:
3385       case Iop_Mul16x8:
3386       case Iop_MulHi16Sx8:
3387       case Iop_MulHi16Ux8:
3388       case Iop_Min16Sx8:
3389       case Iop_Min16Ux8:
3390       case Iop_Max16Sx8:
3391       case Iop_Max16Ux8:
3392       case Iop_CmpGT16Sx8:
3393       case Iop_CmpGT16Ux8:
3394       case Iop_CmpEQ16x8:
3395       case Iop_Avg16Ux8:
3396       case Iop_Avg16Sx8:
3397       case Iop_QAdd16Ux8:
3398       case Iop_QAdd16Sx8:
3399       case Iop_QAddExtUSsatSS16x8:
3400       case Iop_QAddExtSUsatUU16x8:
3401       case Iop_QSal16x8:
3402       case Iop_QShl16x8:
3403       case Iop_Add16x8:
3404       case Iop_QDMulHi16Sx8:
3405       case Iop_QRDMulHi16Sx8:
3406       case Iop_PolynomialMulAdd16x8:
3407          return binary16Ix8(mce, vatom1, vatom2);
3408 
3409       case Iop_Sub32x4:
3410       case Iop_CmpGT32Sx4:
3411       case Iop_CmpGT32Ux4:
3412       case Iop_CmpEQ32x4:
3413       case Iop_QAdd32Sx4:
3414       case Iop_QAdd32Ux4:
3415       case Iop_QSub32Sx4:
3416       case Iop_QSub32Ux4:
3417       case Iop_QAddExtUSsatSS32x4:
3418       case Iop_QAddExtSUsatUU32x4:
3419       case Iop_QSal32x4:
3420       case Iop_QShl32x4:
3421       case Iop_Avg32Ux4:
3422       case Iop_Avg32Sx4:
3423       case Iop_Add32x4:
3424       case Iop_Max32Ux4:
3425       case Iop_Max32Sx4:
3426       case Iop_Min32Ux4:
3427       case Iop_Min32Sx4:
3428       case Iop_Mul32x4:
3429       case Iop_QDMulHi32Sx4:
3430       case Iop_QRDMulHi32Sx4:
3431       case Iop_PolynomialMulAdd32x4:
3432          return binary32Ix4(mce, vatom1, vatom2);
3433 
3434       case Iop_Sub64x2:
3435       case Iop_Add64x2:
3436       case Iop_Max64Sx2:
3437       case Iop_Max64Ux2:
3438       case Iop_Min64Sx2:
3439       case Iop_Min64Ux2:
3440       case Iop_CmpEQ64x2:
3441       case Iop_CmpGT64Sx2:
3442       case Iop_CmpGT64Ux2:
3443       case Iop_QSal64x2:
3444       case Iop_QShl64x2:
3445       case Iop_QAdd64Ux2:
3446       case Iop_QAdd64Sx2:
3447       case Iop_QSub64Ux2:
3448       case Iop_QSub64Sx2:
3449       case Iop_QAddExtUSsatSS64x2:
3450       case Iop_QAddExtSUsatUU64x2:
3451       case Iop_PolynomialMulAdd64x2:
3452       case Iop_CipherV128:
3453       case Iop_CipherLV128:
3454       case Iop_NCipherV128:
3455       case Iop_NCipherLV128:
3456       case Iop_MulI128by10E:
3457       case Iop_MulI128by10ECarry:
3458         return binary64Ix2(mce, vatom1, vatom2);
3459 
3460       case Iop_QNarrowBin64Sto32Sx4:
3461       case Iop_QNarrowBin64Uto32Ux4:
3462       case Iop_QNarrowBin32Sto16Sx8:
3463       case Iop_QNarrowBin32Uto16Ux8:
3464       case Iop_QNarrowBin32Sto16Ux8:
3465       case Iop_QNarrowBin16Sto8Sx16:
3466       case Iop_QNarrowBin16Uto8Ux16:
3467       case Iop_QNarrowBin16Sto8Ux16:
3468          return vectorNarrowBinV128(mce, op, vatom1, vatom2);
3469 
3470       case Iop_Min64Fx2:
3471       case Iop_Max64Fx2:
3472       case Iop_CmpLT64Fx2:
3473       case Iop_CmpLE64Fx2:
3474       case Iop_CmpEQ64Fx2:
3475       case Iop_CmpUN64Fx2:
3476       case Iop_RecipStep64Fx2:
3477       case Iop_RSqrtStep64Fx2:
3478          return binary64Fx2(mce, vatom1, vatom2);
3479 
3480       case Iop_Sub64F0x2:
3481       case Iop_Mul64F0x2:
3482       case Iop_Min64F0x2:
3483       case Iop_Max64F0x2:
3484       case Iop_Div64F0x2:
3485       case Iop_CmpLT64F0x2:
3486       case Iop_CmpLE64F0x2:
3487       case Iop_CmpEQ64F0x2:
3488       case Iop_CmpUN64F0x2:
3489       case Iop_Add64F0x2:
3490          return binary64F0x2(mce, vatom1, vatom2);
3491 
3492       case Iop_Min32Fx4:
3493       case Iop_Max32Fx4:
3494       case Iop_CmpLT32Fx4:
3495       case Iop_CmpLE32Fx4:
3496       case Iop_CmpEQ32Fx4:
3497       case Iop_CmpUN32Fx4:
3498       case Iop_CmpGT32Fx4:
3499       case Iop_CmpGE32Fx4:
3500       case Iop_RecipStep32Fx4:
3501       case Iop_RSqrtStep32Fx4:
3502          return binary32Fx4(mce, vatom1, vatom2);
3503 
3504       case Iop_Sub32Fx2:
3505       case Iop_Mul32Fx2:
3506       case Iop_Min32Fx2:
3507       case Iop_Max32Fx2:
3508       case Iop_CmpEQ32Fx2:
3509       case Iop_CmpGT32Fx2:
3510       case Iop_CmpGE32Fx2:
3511       case Iop_Add32Fx2:
3512       case Iop_RecipStep32Fx2:
3513       case Iop_RSqrtStep32Fx2:
3514          return binary32Fx2(mce, vatom1, vatom2);
3515 
3516       case Iop_Sub32F0x4:
3517       case Iop_Mul32F0x4:
3518       case Iop_Min32F0x4:
3519       case Iop_Max32F0x4:
3520       case Iop_Div32F0x4:
3521       case Iop_CmpLT32F0x4:
3522       case Iop_CmpLE32F0x4:
3523       case Iop_CmpEQ32F0x4:
3524       case Iop_CmpUN32F0x4:
3525       case Iop_Add32F0x4:
3526          return binary32F0x4(mce, vatom1, vatom2);
3527 
3528       case Iop_QShlNsatSU8x16:
3529       case Iop_QShlNsatUU8x16:
3530       case Iop_QShlNsatSS8x16:
3531          complainIfUndefined(mce, atom2, NULL);
3532          return mkPCast8x16(mce, vatom1);
3533 
3534       case Iop_QShlNsatSU16x8:
3535       case Iop_QShlNsatUU16x8:
3536       case Iop_QShlNsatSS16x8:
3537          complainIfUndefined(mce, atom2, NULL);
3538          return mkPCast16x8(mce, vatom1);
3539 
3540       case Iop_QShlNsatSU32x4:
3541       case Iop_QShlNsatUU32x4:
3542       case Iop_QShlNsatSS32x4:
3543          complainIfUndefined(mce, atom2, NULL);
3544          return mkPCast32x4(mce, vatom1);
3545 
3546       case Iop_QShlNsatSU64x2:
3547       case Iop_QShlNsatUU64x2:
3548       case Iop_QShlNsatSS64x2:
3549          complainIfUndefined(mce, atom2, NULL);
3550          return mkPCast32x4(mce, vatom1);
3551 
3552       /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
3553          To make this simpler, do the following:
3554          * complain if the shift amount (the I8) is undefined
3555          * pcast each lane at the wide width
3556          * truncate each lane to half width
3557          * pcast the resulting 64-bit value to a single bit and use
3558            that as the least significant bit of the upper half of the
3559            result. */
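      /* As a concrete illustration (assuming the helpers behave as their
         names suggest): for Iop_QandQShrNnarrow32Uto16Ux4, an undefined
         bit anywhere in 32-bit lane 2 of the first arg causes mkPCast32x4
         to mark that whole lane undefined; Iop_NarrowUn32to16x4 then
         yields an I64 whose 16-bit lane 2 is all-undefined; finally
         mkPCastXXtoXXlsb folds any undefinedness in that I64 into bit 0
         of an otherwise fully-defined I64, which forms the upper half of
         the V128 result, with the narrowed lanes as the lower half. */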
3560       case Iop_QandQShrNnarrow64Uto32Ux2:
3561       case Iop_QandQSarNnarrow64Sto32Sx2:
3562       case Iop_QandQSarNnarrow64Sto32Ux2:
3563       case Iop_QandQRShrNnarrow64Uto32Ux2:
3564       case Iop_QandQRSarNnarrow64Sto32Sx2:
3565       case Iop_QandQRSarNnarrow64Sto32Ux2:
3566       case Iop_QandQShrNnarrow32Uto16Ux4:
3567       case Iop_QandQSarNnarrow32Sto16Sx4:
3568       case Iop_QandQSarNnarrow32Sto16Ux4:
3569       case Iop_QandQRShrNnarrow32Uto16Ux4:
3570       case Iop_QandQRSarNnarrow32Sto16Sx4:
3571       case Iop_QandQRSarNnarrow32Sto16Ux4:
3572       case Iop_QandQShrNnarrow16Uto8Ux8:
3573       case Iop_QandQSarNnarrow16Sto8Sx8:
3574       case Iop_QandQSarNnarrow16Sto8Ux8:
3575       case Iop_QandQRShrNnarrow16Uto8Ux8:
3576       case Iop_QandQRSarNnarrow16Sto8Sx8:
3577       case Iop_QandQRSarNnarrow16Sto8Ux8:
3578       {
3579          IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
3580          IROp opNarrow = Iop_INVALID;
3581          switch (op) {
3582             case Iop_QandQShrNnarrow64Uto32Ux2:
3583             case Iop_QandQSarNnarrow64Sto32Sx2:
3584             case Iop_QandQSarNnarrow64Sto32Ux2:
3585             case Iop_QandQRShrNnarrow64Uto32Ux2:
3586             case Iop_QandQRSarNnarrow64Sto32Sx2:
3587             case Iop_QandQRSarNnarrow64Sto32Ux2:
3588                fnPessim = mkPCast64x2;
3589                opNarrow = Iop_NarrowUn64to32x2;
3590                break;
3591             case Iop_QandQShrNnarrow32Uto16Ux4:
3592             case Iop_QandQSarNnarrow32Sto16Sx4:
3593             case Iop_QandQSarNnarrow32Sto16Ux4:
3594             case Iop_QandQRShrNnarrow32Uto16Ux4:
3595             case Iop_QandQRSarNnarrow32Sto16Sx4:
3596             case Iop_QandQRSarNnarrow32Sto16Ux4:
3597                fnPessim = mkPCast32x4;
3598                opNarrow = Iop_NarrowUn32to16x4;
3599                break;
3600             case Iop_QandQShrNnarrow16Uto8Ux8:
3601             case Iop_QandQSarNnarrow16Sto8Sx8:
3602             case Iop_QandQSarNnarrow16Sto8Ux8:
3603             case Iop_QandQRShrNnarrow16Uto8Ux8:
3604             case Iop_QandQRSarNnarrow16Sto8Sx8:
3605             case Iop_QandQRSarNnarrow16Sto8Ux8:
3606                fnPessim = mkPCast16x8;
3607                opNarrow = Iop_NarrowUn16to8x8;
3608                break;
3609             default:
3610                tl_assert(0);
3611          }
3612          complainIfUndefined(mce, atom2, NULL);
3613          // Pessimised shift result
3614          IRAtom* shV
3615             = fnPessim(mce, vatom1);
3616          // Narrowed, pessimised shift result
3617          IRAtom* shVnarrowed
3618             = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
3619          // Generates: Def--(63)--Def PCast-to-I1(narrowed)
3620          IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
3621          // and assemble the result
3622          return assignNew('V', mce, Ity_V128,
3623                           binop(Iop_64HLtoV128, qV, shVnarrowed));
3624       }
3625 
3626       case Iop_Mull32Sx2:
3627       case Iop_Mull32Ux2:
3628       case Iop_QDMull32Sx2:
3629          return vectorWidenI64(mce, Iop_Widen32Sto64x2,
3630                                     mkUifU64(mce, vatom1, vatom2));
3631 
3632       case Iop_Mull16Sx4:
3633       case Iop_Mull16Ux4:
3634       case Iop_QDMull16Sx4:
3635          return vectorWidenI64(mce, Iop_Widen16Sto32x4,
3636                                     mkUifU64(mce, vatom1, vatom2));
3637 
3638       case Iop_Mull8Sx8:
3639       case Iop_Mull8Ux8:
3640       case Iop_PolynomialMull8x8:
3641          return vectorWidenI64(mce, Iop_Widen8Sto16x8,
3642                                     mkUifU64(mce, vatom1, vatom2));
3643 
3644       case Iop_PwAdd32x4:
3645          return mkPCast32x4(mce,
3646                assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
3647                      mkPCast32x4(mce, vatom2))));
3648 
3649       case Iop_PwAdd16x8:
3650          return mkPCast16x8(mce,
3651                assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
3652                      mkPCast16x8(mce, vatom2))));
3653 
3654       case Iop_PwAdd8x16:
3655          return mkPCast8x16(mce,
3656                assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
3657                      mkPCast8x16(mce, vatom2))));
3658 
3659       /* V128-bit data-steering */
3660       case Iop_SetV128lo32:
3661       case Iop_SetV128lo64:
3662       case Iop_64HLtoV128:
3663       case Iop_InterleaveLO64x2:
3664       case Iop_InterleaveLO32x4:
3665       case Iop_InterleaveLO16x8:
3666       case Iop_InterleaveLO8x16:
3667       case Iop_InterleaveHI64x2:
3668       case Iop_InterleaveHI32x4:
3669       case Iop_InterleaveHI16x8:
3670       case Iop_InterleaveHI8x16:
3671       case Iop_CatOddLanes8x16:
3672       case Iop_CatOddLanes16x8:
3673       case Iop_CatOddLanes32x4:
3674       case Iop_CatEvenLanes8x16:
3675       case Iop_CatEvenLanes16x8:
3676       case Iop_CatEvenLanes32x4:
3677       case Iop_InterleaveOddLanes8x16:
3678       case Iop_InterleaveOddLanes16x8:
3679       case Iop_InterleaveOddLanes32x4:
3680       case Iop_InterleaveEvenLanes8x16:
3681       case Iop_InterleaveEvenLanes16x8:
3682       case Iop_InterleaveEvenLanes32x4:
3683          return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
3684 
3685       case Iop_GetElem8x16:
3686          complainIfUndefined(mce, atom2, NULL);
3687          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3688       case Iop_GetElem16x8:
3689          complainIfUndefined(mce, atom2, NULL);
3690          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3691       case Iop_GetElem32x4:
3692          complainIfUndefined(mce, atom2, NULL);
3693          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3694       case Iop_GetElem64x2:
3695          complainIfUndefined(mce, atom2, NULL);
3696          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3697 
3698      /* Perm8x16: rearrange values in left arg using steering values
3699         from right arg.  So rearrange the vbits in the same way but
3700         pessimise wrt steering values.  Perm32x4 ditto. */
3701       case Iop_Perm8x16:
3702          return mkUifUV128(
3703                    mce,
3704                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3705                    mkPCast8x16(mce, vatom2)
3706                 );
3707       case Iop_Perm32x4:
3708          return mkUifUV128(
3709                    mce,
3710                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3711                    mkPCast32x4(mce, vatom2)
3712                 );
3713 
3714      /* These two take the lower half of each 16-bit lane, sign/zero
3715         extend it to 32, and multiply together, producing a 32x4
3716         result (and implicitly ignoring half the operand bits).  So
3717         treat it as a bunch of independent 16x8 operations, but then
3718         do 32-bit shifts left-right to copy the lower half results
3719         (which are all 0s or all 1s due to PCasting in binary16Ix8)
3720         into the upper half of each result lane. */
3721       case Iop_MullEven16Ux8:
3722       case Iop_MullEven16Sx8: {
3723          IRAtom* at;
3724          at = binary16Ix8(mce,vatom1,vatom2);
3725          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
3726          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
3727          return at;
3728       }
3729 
3730       /* Same deal as Iop_MullEven16{S,U}x8 */
3731       case Iop_MullEven8Ux16:
3732       case Iop_MullEven8Sx16: {
3733          IRAtom* at;
3734          at = binary8Ix16(mce,vatom1,vatom2);
3735          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
3736          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
3737          return at;
3738       }
3739 
3740       /* Same deal as Iop_MullEven16{S,U}x8 */
3741       case Iop_MullEven32Ux4:
3742       case Iop_MullEven32Sx4: {
3743          IRAtom* at;
3744          at = binary32Ix4(mce,vatom1,vatom2);
3745          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
3746          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
3747          return at;
3748       }
3749 
3750       /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
3751          32x4 -> 16x8 laneage, discarding the upper half of each lane.
3752          Simply apply same op to the V bits, since this really no more
3753          than a data steering operation. */
3754       case Iop_NarrowBin32to16x8:
3755       case Iop_NarrowBin16to8x16:
3756       case Iop_NarrowBin64to32x4:
3757          return assignNew('V', mce, Ity_V128,
3758                                     binop(op, vatom1, vatom2));
3759 
3760       case Iop_ShrV128:
3761       case Iop_ShlV128:
3762       case Iop_I128StoBCD128:
3763          /* Same scheme as with all other shifts.  Note: 10 Nov 05:
3764             this is wrong now, scalar shifts are done properly lazily.
3765             Vector shifts should be fixed too. */
3766          complainIfUndefined(mce, atom2, NULL);
3767          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3768 
3769       case Iop_BCDAdd:
3770       case Iop_BCDSub:
3771          return mkLazy2(mce, Ity_V128, vatom1, vatom2);
3772 
3773       /* SHA Iops */
3774       case Iop_SHA256:
3775       case Iop_SHA512:
3776          complainIfUndefined(mce, atom2, NULL);
3777          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3778 
3779       /* I128-bit data-steering */
3780       case Iop_64HLto128:
3781          return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
3782 
3783       /* V256-bit SIMD */
3784 
3785       case Iop_Max64Fx4:
3786       case Iop_Min64Fx4:
3787          return binary64Fx4(mce, vatom1, vatom2);
3788 
3789       case Iop_Max32Fx8:
3790       case Iop_Min32Fx8:
3791          return binary32Fx8(mce, vatom1, vatom2);
3792 
3793       /* V256-bit data-steering */
3794       case Iop_V128HLtoV256:
3795          return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
3796 
3797       /* Scalar floating point */
3798 
3799       case Iop_F32toI64S:
3800       case Iop_F32toI64U:
3801          /* I32(rm) x F32 -> I64 */
3802          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3803 
3804       case Iop_I64StoF32:
3805          /* I32(rm) x I64 -> F32 */
3806          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3807 
3808       case Iop_RoundF64toInt:
3809       case Iop_RoundF64toF32:
3810       case Iop_F64toI64S:
3811       case Iop_F64toI64U:
3812       case Iop_I64StoF64:
3813       case Iop_I64UtoF64:
3814       case Iop_SinF64:
3815       case Iop_CosF64:
3816       case Iop_TanF64:
3817       case Iop_2xm1F64:
3818       case Iop_SqrtF64:
3819       case Iop_RecpExpF64:
3820          /* I32(rm) x I64/F64 -> I64/F64 */
3821          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3822 
3823       case Iop_ShlD64:
3824       case Iop_ShrD64:
3825       case Iop_RoundD64toInt:
3826          /* I32(rm) x D64 -> D64 */
3827          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3828 
3829       case Iop_ShlD128:
3830       case Iop_ShrD128:
3831       case Iop_RoundD128toInt:
3832          /* I32(rm) x D128 -> D128 */
3833          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3834 
3835       case Iop_RoundF128toInt:
3836          /* I32(rm) x F128 -> F128 */
3837          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3838 
3839       case Iop_D64toI64S:
3840       case Iop_D64toI64U:
3841       case Iop_I64StoD64:
3842       case Iop_I64UtoD64:
3843          /* I32(rm) x I64/D64 -> D64/I64 */
3844          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3845 
3846       case Iop_F32toD32:
3847       case Iop_F64toD32:
3848       case Iop_F128toD32:
3849       case Iop_D32toF32:
3850       case Iop_D64toF32:
3851       case Iop_D128toF32:
3852          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
3853          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3854 
3855       case Iop_F32toD64:
3856       case Iop_F64toD64:
3857       case Iop_F128toD64:
3858       case Iop_D32toF64:
3859       case Iop_D64toF64:
3860       case Iop_D128toF64:
3861          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
3862          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3863 
3864       case Iop_F32toD128:
3865       case Iop_F64toD128:
3866       case Iop_F128toD128:
3867       case Iop_D32toF128:
3868       case Iop_D64toF128:
3869       case Iop_D128toF128:
3870          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
3871          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3872 
3873       case Iop_RoundF32toInt:
3874       case Iop_SqrtF32:
3875       case Iop_RecpExpF32:
3876          /* I32(rm) x I32/F32 -> I32/F32 */
3877          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3878 
3879       case Iop_SqrtF128:
3880          /* I32(rm) x F128 -> F128 */
3881          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3882 
3883       case Iop_I32StoF32:
3884       case Iop_I32UtoF32:
3885       case Iop_F32toI32S:
3886       case Iop_F32toI32U:
3887          /* First arg is I32 (rounding mode), second is F32/I32 (data). */
3888          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3889 
3890       case Iop_F64toF16:
3891       case Iop_F32toF16:
3892          /* First arg is I32 (rounding mode), second is F64/F32 (data). */
3893          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
3894 
3895       case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32  */
3896       case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32  */
3897       case Iop_F128toF32:  /* IRRoundingMode(I32) x F128 -> F32         */
3898       case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32  */
3899       case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32  */
3900          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3901 
3902       case Iop_F128toI128S:   /* IRRoundingMode(I32) x F128 -> signed I128 */
3903       case Iop_RndF128:       /* IRRoundingMode(I32) x F128 -> F128 */
3904          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3905 
3906       case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64  */
3907       case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64  */
3908       case Iop_F128toF64:  /* IRRoundingMode(I32) x F128 -> F64         */
3909       case Iop_D128toD64:  /* IRRoundingMode(I64) x D128 -> D64 */
3910       case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64  */
3911       case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64  */
3912          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3913 
3914       case Iop_F64HLtoF128:
3915       case Iop_D64HLtoD128:
3916          return assignNew('V', mce, Ity_I128,
3917                           binop(Iop_64HLto128, vatom1, vatom2));
3918 
3919       case Iop_F64toI32U:
3920       case Iop_F64toI32S:
3921       case Iop_F64toF32:
3922       case Iop_I64UtoF32:
3923       case Iop_D64toI32U:
3924       case Iop_D64toI32S:
3925          /* First arg is I32 (rounding mode), second is F64/D64 (data). */
3926          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3927 
3928       case Iop_D64toD32:
3929          /* First arg is I32 (rounding mode), second is D64 (data). */
3930          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3931 
3932       case Iop_F64toI16S:
3933          /* First arg is I32 (rounding mode), second is F64 (data). */
3934          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
3935 
3936       case Iop_InsertExpD64:
3937          /*  I64 x I64 -> D64 */
3938          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3939 
3940       case Iop_InsertExpD128:
3941          /*  I64 x I128 -> D128 */
3942          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3943 
3944       case Iop_CmpF32:
3945       case Iop_CmpF64:
3946       case Iop_CmpF128:
3947       case Iop_CmpD64:
3948       case Iop_CmpD128:
3949       case Iop_CmpExpD64:
3950       case Iop_CmpExpD128:
3951          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3952 
3953       case Iop_MaxNumF32:
3954       case Iop_MinNumF32:
3955          /* F32 x F32 -> F32 */
3956          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
3957 
3958       case Iop_MaxNumF64:
3959       case Iop_MinNumF64:
3960          /* F64 x F64 -> F64 */
3961          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3962 
3963       /* non-FP after here */
3964 
3965       case Iop_DivModU64to32:
3966       case Iop_DivModS64to32:
3967          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
3968 
3969       case Iop_DivModU128to64:
3970       case Iop_DivModS128to64:
3971          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
3972 
3973       case Iop_8HLto16:
3974          return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
3975       case Iop_16HLto32:
3976          return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
3977       case Iop_32HLto64:
3978          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3979 
3980       case Iop_DivModS64to64:
3981       case Iop_MullS64:
3982       case Iop_MullU64: {
3983          IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
3984          IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
3985          return assignNew('V', mce, Ity_I128,
3986                           binop(Iop_64HLto128, vHi64, vLo64));
3987       }
3988 
3989       case Iop_MullS32:
3990       case Iop_MullU32: {
3991          IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
3992          IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
3993          return assignNew('V', mce, Ity_I64,
3994                           binop(Iop_32HLto64, vHi32, vLo32));
3995       }
3996 
3997       case Iop_MullS16:
3998       case Iop_MullU16: {
3999          IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4000          IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
4001          return assignNew('V', mce, Ity_I32,
4002                           binop(Iop_16HLto32, vHi16, vLo16));
4003       }
4004 
4005       case Iop_MullS8:
4006       case Iop_MullU8: {
4007          IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4008          IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
4009          return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
4010       }
4011 
4012       case Iop_Sad8Ux4: /* maybe we could do better?  ftm, do mkLazy2. */
4013       case Iop_DivS32:
4014       case Iop_DivU32:
4015       case Iop_DivU32E:
4016       case Iop_DivS32E:
4017       case Iop_QAdd32S: /* could probably do better */
4018       case Iop_QSub32S: /* could probably do better */
4019          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4020 
4021       case Iop_DivS64:
4022       case Iop_DivU64:
4023       case Iop_DivS64E:
4024       case Iop_DivU64E:
4025          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4026 
4027       case Iop_Add32:
4028          if (mce->bogusLiterals || mce->useLLVMworkarounds)
4029             return expensiveAddSub(mce,True,Ity_I32,
4030                                    vatom1,vatom2, atom1,atom2);
4031          else
4032             goto cheap_AddSub32;
4033       case Iop_Sub32:
4034          if (mce->bogusLiterals)
4035             return expensiveAddSub(mce,False,Ity_I32,
4036                                    vatom1,vatom2, atom1,atom2);
4037          else
4038             goto cheap_AddSub32;
4039 
4040       cheap_AddSub32:
4041       case Iop_Mul32:
4042          return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
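      /* mkLeft32/mkLeft64 implement the "left" operation (assuming the
         standard Memcheck definition, Left(v) = v | -v): undefinedness
         is smeared from the lowest undefined bit up to the MSB, since a
         carry out of an undefined bit can affect every higher result bit
         of an add/sub/mul, but never a lower one.  For example
         Left(0x00000100) = 0xFFFFFF00, leaving bits 7..0 defined. */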
4043 
4044       case Iop_CmpORD32S:
4045       case Iop_CmpORD32U:
4046       case Iop_CmpORD64S:
4047       case Iop_CmpORD64U:
4048          return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
4049 
4050       case Iop_Add64:
4051          if (mce->bogusLiterals || mce->useLLVMworkarounds)
4052             return expensiveAddSub(mce,True,Ity_I64,
4053                                    vatom1,vatom2, atom1,atom2);
4054          else
4055             goto cheap_AddSub64;
4056       case Iop_Sub64:
4057          if (mce->bogusLiterals)
4058             return expensiveAddSub(mce,False,Ity_I64,
4059                                    vatom1,vatom2, atom1,atom2);
4060          else
4061             goto cheap_AddSub64;
4062 
4063       cheap_AddSub64:
4064       case Iop_Mul64:
4065          return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4066 
4067       case Iop_Mul16:
4068       case Iop_Add16:
4069       case Iop_Sub16:
4070          return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4071 
4072       case Iop_Mul8:
4073       case Iop_Sub8:
4074       case Iop_Add8:
4075          return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4076 
4077       case Iop_CmpEQ64:
4078       case Iop_CmpNE64:
4079          if (mce->bogusLiterals)
4080             goto expensive_cmp64;
4081          else
4082             goto cheap_cmp64;
4083 
4084       expensive_cmp64:
4085       case Iop_ExpCmpNE64:
4086          return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4087 
4088       cheap_cmp64:
4089       case Iop_CmpLE64S: case Iop_CmpLE64U:
4090       case Iop_CmpLT64U: case Iop_CmpLT64S:
4091          return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
4092 
4093       case Iop_CmpEQ32:
4094       case Iop_CmpNE32:
4095          if (mce->bogusLiterals)
4096             goto expensive_cmp32;
4097          else
4098             goto cheap_cmp32;
4099 
4100       expensive_cmp32:
4101       case Iop_ExpCmpNE32:
4102          return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4103 
4104       cheap_cmp32:
4105       case Iop_CmpLE32S: case Iop_CmpLE32U:
4106       case Iop_CmpLT32U: case Iop_CmpLT32S:
4107          return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4108 
4109       case Iop_CmpEQ16: case Iop_CmpNE16:
4110          return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4111 
4112       case Iop_ExpCmpNE16:
4113          return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4114 
4115       case Iop_CmpEQ8: case Iop_CmpNE8:
4116          return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4117 
4118       case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
4119       case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4120       case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4121       case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4122          /* Just say these all produce a defined result, regardless
4123             of their arguments.  See COMMENT_ON_CasCmpEQ in this file. */
4124          return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4125 
4126       case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4127          return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4128 
4129       case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4130          return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4131 
4132       case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4133          return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4134 
4135       case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4136          return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4137 
4138       case Iop_AndV256:
4139          uifu = mkUifUV256; difd = mkDifDV256;
4140          and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4141       case Iop_AndV128:
4142          uifu = mkUifUV128; difd = mkDifDV128;
4143          and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4144       case Iop_And64:
4145          uifu = mkUifU64; difd = mkDifD64;
4146          and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4147       case Iop_And32:
4148          uifu = mkUifU32; difd = mkDifD32;
4149          and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4150       case Iop_And16:
4151          uifu = mkUifU16; difd = mkDifD16;
4152          and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4153       case Iop_And8:
4154          uifu = mkUifU8; difd = mkDifD8;
4155          and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4156 
4157       case Iop_OrV256:
4158          uifu = mkUifUV256; difd = mkDifDV256;
4159          and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4160       case Iop_OrV128:
4161          uifu = mkUifUV128; difd = mkDifDV128;
4162          and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4163       case Iop_Or64:
4164          uifu = mkUifU64; difd = mkDifD64;
4165          and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4166       case Iop_Or32:
4167          uifu = mkUifU32; difd = mkDifD32;
4168          and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4169       case Iop_Or16:
4170          uifu = mkUifU16; difd = mkDifD16;
4171          and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4172       case Iop_Or8:
4173          uifu = mkUifU8; difd = mkDifD8;
4174          and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4175 
4176       do_And_Or:
4177          return
4178          assignNew(
4179             'V', mce,
4180             and_or_ty,
4181             difd(mce, uifu(mce, vatom1, vatom2),
4182                       difd(mce, improve(mce, atom1, vatom1),
4183                                 improve(mce, atom2, vatom2) ) ) );
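      /* Written out, and assuming the usual definitions of the
         improvement terms (mkImproveAND* = data | vbits, mkImproveOR* =
         ~data | vbits), the net shadow for AND is
            (v1 | v2) & (a1 | v1) & (a2 | v2)
         and for OR it is
            (v1 | v2) & (~a1 | v1) & (~a2 | v2).
         So a defined 0 fed into an AND, or a defined 1 fed into an OR,
         forces the corresponding result bit to be defined regardless of
         the other operand. */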
4184 
4185       case Iop_Xor8:
4186          return mkUifU8(mce, vatom1, vatom2);
4187       case Iop_Xor16:
4188          return mkUifU16(mce, vatom1, vatom2);
4189       case Iop_Xor32:
4190          return mkUifU32(mce, vatom1, vatom2);
4191       case Iop_Xor64:
4192          return mkUifU64(mce, vatom1, vatom2);
4193       case Iop_XorV128:
4194          return mkUifUV128(mce, vatom1, vatom2);
4195       case Iop_XorV256:
4196          return mkUifUV256(mce, vatom1, vatom2);
4197 
4198       /* V256-bit SIMD */
4199 
4200       case Iop_ShrN16x16:
4201       case Iop_ShrN32x8:
4202       case Iop_ShrN64x4:
4203       case Iop_SarN16x16:
4204       case Iop_SarN32x8:
4205       case Iop_ShlN16x16:
4206       case Iop_ShlN32x8:
4207       case Iop_ShlN64x4:
4208          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
4209             this is wrong now, scalar shifts are done properly lazily.
4210             Vector shifts should be fixed too. */
4211          complainIfUndefined(mce, atom2, NULL);
4212          return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4213 
4214       case Iop_QSub8Ux32:
4215       case Iop_QSub8Sx32:
4216       case Iop_Sub8x32:
4217       case Iop_Min8Ux32:
4218       case Iop_Min8Sx32:
4219       case Iop_Max8Ux32:
4220       case Iop_Max8Sx32:
4221       case Iop_CmpGT8Sx32:
4222       case Iop_CmpEQ8x32:
4223       case Iop_Avg8Ux32:
4224       case Iop_QAdd8Ux32:
4225       case Iop_QAdd8Sx32:
4226       case Iop_Add8x32:
4227          return binary8Ix32(mce, vatom1, vatom2);
4228 
4229       case Iop_QSub16Ux16:
4230       case Iop_QSub16Sx16:
4231       case Iop_Sub16x16:
4232       case Iop_Mul16x16:
4233       case Iop_MulHi16Sx16:
4234       case Iop_MulHi16Ux16:
4235       case Iop_Min16Sx16:
4236       case Iop_Min16Ux16:
4237       case Iop_Max16Sx16:
4238       case Iop_Max16Ux16:
4239       case Iop_CmpGT16Sx16:
4240       case Iop_CmpEQ16x16:
4241       case Iop_Avg16Ux16:
4242       case Iop_QAdd16Ux16:
4243       case Iop_QAdd16Sx16:
4244       case Iop_Add16x16:
4245          return binary16Ix16(mce, vatom1, vatom2);
4246 
4247       case Iop_Sub32x8:
4248       case Iop_CmpGT32Sx8:
4249       case Iop_CmpEQ32x8:
4250       case Iop_Add32x8:
4251       case Iop_Max32Ux8:
4252       case Iop_Max32Sx8:
4253       case Iop_Min32Ux8:
4254       case Iop_Min32Sx8:
4255       case Iop_Mul32x8:
4256          return binary32Ix8(mce, vatom1, vatom2);
4257 
4258       case Iop_Sub64x4:
4259       case Iop_Add64x4:
4260       case Iop_CmpEQ64x4:
4261       case Iop_CmpGT64Sx4:
4262          return binary64Ix4(mce, vatom1, vatom2);
4263 
4264      /* Perm32x8: rearrange values in left arg using steering values
4265         from right arg.  So rearrange the vbits in the same way but
4266         pessimise wrt steering values. */
4267       case Iop_Perm32x8:
4268          return mkUifUV256(
4269                    mce,
4270                    assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
4271                    mkPCast32x8(mce, vatom2)
4272                 );
4273 
4274       /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
4275          Handle the shifted results in the same way that other
4276          binary Q ops are handled, eg QSub: UifU the two args,
4277          then pessimise -- which is binaryNIxM.  But for the upper
4278          V128, we require to generate just 1 bit which is the
4279          pessimised shift result, with 127 defined zeroes above it.
4280 
4281          Note that this overly pessimistic in that in fact only the
4282          bottom 8 bits of each lane of the second arg determine the shift
4283          amount.  Really we ought to ignore any undefinedness in the
4284          rest of the lanes of the second arg. */
4285       case Iop_QandSQsh64x2:  case Iop_QandUQsh64x2:
4286       case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
4287       case Iop_QandSQsh32x4:  case Iop_QandUQsh32x4:
4288       case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
4289       case Iop_QandSQsh16x8:  case Iop_QandUQsh16x8:
4290       case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
4291       case Iop_QandSQsh8x16:  case Iop_QandUQsh8x16:
4292       case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
4293       {
4294          // The function to generate the pessimised shift result
4295          IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
4296          switch (op) {
4297             case Iop_QandSQsh64x2:
4298             case Iop_QandUQsh64x2:
4299             case Iop_QandSQRsh64x2:
4300             case Iop_QandUQRsh64x2:
4301                binaryNIxM = binary64Ix2;
4302                break;
4303             case Iop_QandSQsh32x4:
4304             case Iop_QandUQsh32x4:
4305             case Iop_QandSQRsh32x4:
4306             case Iop_QandUQRsh32x4:
4307                binaryNIxM = binary32Ix4;
4308                break;
4309             case Iop_QandSQsh16x8:
4310             case Iop_QandUQsh16x8:
4311             case Iop_QandSQRsh16x8:
4312             case Iop_QandUQRsh16x8:
4313                binaryNIxM = binary16Ix8;
4314                break;
4315             case Iop_QandSQsh8x16:
4316             case Iop_QandUQsh8x16:
4317             case Iop_QandSQRsh8x16:
4318             case Iop_QandUQRsh8x16:
4319                binaryNIxM = binary8Ix16;
4320                break;
4321             default:
4322                tl_assert(0);
4323          }
4324          tl_assert(binaryNIxM);
4325          // Pessimised shift result, shV[127:0]
4326          IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
4327          // Generates: Def--(127)--Def PCast-to-I1(shV)
4328          IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
4329          // and assemble the result
4330          return assignNew('V', mce, Ity_V256,
4331                           binop(Iop_V128HLtoV256, qV, shV));
4332       }
4333 
4334       default:
4335          ppIROp(op);
4336          VG_(tool_panic)("memcheck:expr2vbits_Binop");
4337    }
4338 }
4339 
4340 
4341 static
4342 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
4343 {
4344    /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
4345       selection of shadow operation implicitly duplicates the logic in
4346       do_shadow_LoadG and should be kept in sync (in the very unlikely
4347       event that the interpretation of such widening ops changes in
4348       future).  See comment in do_shadow_LoadG. */
4349    IRAtom* vatom = expr2vbits( mce, atom );
4350    tl_assert(isOriginalAtom(mce,atom));
4351    switch (op) {
4352 
4353       case Iop_Abs64Fx2:
4354       case Iop_Neg64Fx2:
4355       case Iop_RSqrtEst64Fx2:
4356       case Iop_RecipEst64Fx2:
4357          return unary64Fx2(mce, vatom);
4358 
4359       case Iop_Sqrt64F0x2:
4360          return unary64F0x2(mce, vatom);
4361 
4362       case Iop_Sqrt32Fx8:
4363       case Iop_RSqrtEst32Fx8:
4364       case Iop_RecipEst32Fx8:
4365          return unary32Fx8(mce, vatom);
4366 
4367       case Iop_Sqrt64Fx4:
4368          return unary64Fx4(mce, vatom);
4369 
4370       case Iop_RecipEst32Fx4:
4371       case Iop_I32UtoFx4:
4372       case Iop_I32StoFx4:
4373       case Iop_QFtoI32Ux4_RZ:
4374       case Iop_QFtoI32Sx4_RZ:
4375       case Iop_RoundF32x4_RM:
4376       case Iop_RoundF32x4_RP:
4377       case Iop_RoundF32x4_RN:
4378       case Iop_RoundF32x4_RZ:
4379       case Iop_RecipEst32Ux4:
4380       case Iop_Abs32Fx4:
4381       case Iop_Neg32Fx4:
4382       case Iop_RSqrtEst32Fx4:
4383          return unary32Fx4(mce, vatom);
4384 
4385       case Iop_I32UtoFx2:
4386       case Iop_I32StoFx2:
4387       case Iop_RecipEst32Fx2:
4388       case Iop_RecipEst32Ux2:
4389       case Iop_Abs32Fx2:
4390       case Iop_Neg32Fx2:
4391       case Iop_RSqrtEst32Fx2:
4392          return unary32Fx2(mce, vatom);
4393 
4394       case Iop_Sqrt32F0x4:
4395       case Iop_RSqrtEst32F0x4:
4396       case Iop_RecipEst32F0x4:
4397          return unary32F0x4(mce, vatom);
4398 
4399       case Iop_32UtoV128:
4400       case Iop_64UtoV128:
4401       case Iop_Dup8x16:
4402       case Iop_Dup16x8:
4403       case Iop_Dup32x4:
4404       case Iop_Reverse1sIn8_x16:
4405       case Iop_Reverse8sIn16_x8:
4406       case Iop_Reverse8sIn32_x4:
4407       case Iop_Reverse16sIn32_x4:
4408       case Iop_Reverse8sIn64_x2:
4409       case Iop_Reverse16sIn64_x2:
4410       case Iop_Reverse32sIn64_x2:
4411       case Iop_V256toV128_1: case Iop_V256toV128_0:
4412       case Iop_ZeroHI64ofV128:
4413       case Iop_ZeroHI96ofV128:
4414       case Iop_ZeroHI112ofV128:
4415       case Iop_ZeroHI120ofV128:
4416          return assignNew('V', mce, Ity_V128, unop(op, vatom));
4417 
4418       case Iop_F128HItoF64:  /* F128 -> high half of F128 */
4419       case Iop_D128HItoD64:  /* D128 -> high half of D128 */
4420          return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
4421       case Iop_F128LOtoF64:  /* F128 -> low  half of F128 */
4422       case Iop_D128LOtoD64:  /* D128 -> low  half of D128 */
4423          return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
4424 
4425       case Iop_NegF128:
4426       case Iop_AbsF128:
4427       case Iop_RndF128:
4428       case Iop_TruncF128toI64S: /* F128 -> I64S */
4429       case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
4430       case Iop_TruncF128toI64U: /* F128 -> I64U */
4431       case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
4432          return mkPCastTo(mce, Ity_I128, vatom);
4433 
4434       case Iop_BCD128toI128S:
4435       case Iop_MulI128by10:
4436       case Iop_MulI128by10Carry:
4437       case Iop_F16toF64x2:
4438       case Iop_F64toF16x2:
4439          return vatom;
4440 
4441       case Iop_I32StoF128: /* signed I32 -> F128 */
4442       case Iop_I64StoF128: /* signed I64 -> F128 */
4443       case Iop_I32UtoF128: /* unsigned I32 -> F128 */
4444       case Iop_I64UtoF128: /* unsigned I64 -> F128 */
4445       case Iop_F32toF128:  /* F32 -> F128 */
4446       case Iop_F64toF128:  /* F64 -> F128 */
4447       case Iop_I32StoD128: /* signed I32 -> D128 */
4448       case Iop_I64StoD128: /* signed I64 -> D128 */
4449       case Iop_I32UtoD128: /* unsigned I32 -> D128 */
4450       case Iop_I64UtoD128: /* unsigned I64 -> D128 */
4451          return mkPCastTo(mce, Ity_I128, vatom);
4452 
4453       case Iop_F16toF64:
4454       case Iop_F32toF64:
4455       case Iop_I32StoF64:
4456       case Iop_I32UtoF64:
4457       case Iop_NegF64:
4458       case Iop_AbsF64:
4459       case Iop_RSqrtEst5GoodF64:
4460       case Iop_RoundF64toF64_NEAREST:
4461       case Iop_RoundF64toF64_NegINF:
4462       case Iop_RoundF64toF64_PosINF:
4463       case Iop_RoundF64toF64_ZERO:
4464       case Iop_Clz64:
4465       case Iop_D32toD64:
4466       case Iop_I32StoD64:
4467       case Iop_I32UtoD64:
4468       case Iop_ExtractExpD64:    /* D64  -> I64 */
4469       case Iop_ExtractExpD128:   /* D128 -> I64 */
4470       case Iop_ExtractSigD64:    /* D64  -> I64 */
4471       case Iop_ExtractSigD128:   /* D128 -> I64 */
4472       case Iop_DPBtoBCD:
4473       case Iop_BCDtoDPB:
4474          return mkPCastTo(mce, Ity_I64, vatom);
4475 
4476       case Iop_D64toD128:
4477          return mkPCastTo(mce, Ity_I128, vatom);
4478 
4479       case Iop_Clz32:
4480       case Iop_TruncF64asF32:
4481       case Iop_NegF32:
4482       case Iop_AbsF32:
4483       case Iop_F16toF32:
4484          return mkPCastTo(mce, Ity_I32, vatom);
4485 
4486       case Iop_Ctz32:
4487       case Iop_Ctz64:
4488          return expensiveCountTrailingZeroes(mce, op, atom, vatom);
4489 
4490       case Iop_1Uto64:
4491       case Iop_1Sto64:
4492       case Iop_8Uto64:
4493       case Iop_8Sto64:
4494       case Iop_16Uto64:
4495       case Iop_16Sto64:
4496       case Iop_32Sto64:
4497       case Iop_32Uto64:
4498       case Iop_V128to64:
4499       case Iop_V128HIto64:
4500       case Iop_128HIto64:
4501       case Iop_128to64:
4502       case Iop_Dup8x8:
4503       case Iop_Dup16x4:
4504       case Iop_Dup32x2:
4505       case Iop_Reverse8sIn16_x4:
4506       case Iop_Reverse8sIn32_x2:
4507       case Iop_Reverse16sIn32_x2:
4508       case Iop_Reverse8sIn64_x1:
4509       case Iop_Reverse16sIn64_x1:
4510       case Iop_Reverse32sIn64_x1:
4511       case Iop_V256to64_0: case Iop_V256to64_1:
4512       case Iop_V256to64_2: case Iop_V256to64_3:
4513          return assignNew('V', mce, Ity_I64, unop(op, vatom));
4514 
4515       case Iop_64to32:
4516       case Iop_64HIto32:
4517       case Iop_1Uto32:
4518       case Iop_1Sto32:
4519       case Iop_8Uto32:
4520       case Iop_16Uto32:
4521       case Iop_16Sto32:
4522       case Iop_8Sto32:
4523       case Iop_V128to32:
4524          return assignNew('V', mce, Ity_I32, unop(op, vatom));
4525 
4526       case Iop_8Sto16:
4527       case Iop_8Uto16:
4528       case Iop_32to16:
4529       case Iop_32HIto16:
4530       case Iop_64to16:
4531       case Iop_GetMSBs8x16:
4532          return assignNew('V', mce, Ity_I16, unop(op, vatom));
4533 
4534       case Iop_1Uto8:
4535       case Iop_1Sto8:
4536       case Iop_16to8:
4537       case Iop_16HIto8:
4538       case Iop_32to8:
4539       case Iop_64to8:
4540       case Iop_GetMSBs8x8:
4541          return assignNew('V', mce, Ity_I8, unop(op, vatom));
4542 
4543       case Iop_32to1:
4544          return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
4545 
4546       case Iop_64to1:
4547          return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
4548 
4549       case Iop_ReinterpF64asI64:
4550       case Iop_ReinterpI64asF64:
4551       case Iop_ReinterpI32asF32:
4552       case Iop_ReinterpF32asI32:
4553       case Iop_ReinterpI64asD64:
4554       case Iop_ReinterpD64asI64:
4555       case Iop_NotV256:
4556       case Iop_NotV128:
4557       case Iop_Not64:
4558       case Iop_Not32:
4559       case Iop_Not16:
4560       case Iop_Not8:
4561       case Iop_Not1:
4562          return vatom;
4563 
4564       case Iop_CmpNEZ8x8:
4565       case Iop_Cnt8x8:
4566       case Iop_Clz8x8:
4567       case Iop_Cls8x8:
4568       case Iop_Abs8x8:
4569          return mkPCast8x8(mce, vatom);
4570 
4571       case Iop_CmpNEZ8x16:
4572       case Iop_Cnt8x16:
4573       case Iop_Clz8x16:
4574       case Iop_Cls8x16:
4575       case Iop_Abs8x16:
4576       case Iop_Ctz8x16:
4577          return mkPCast8x16(mce, vatom);
4578 
4579       case Iop_CmpNEZ16x4:
4580       case Iop_Clz16x4:
4581       case Iop_Cls16x4:
4582       case Iop_Abs16x4:
4583          return mkPCast16x4(mce, vatom);
4584 
4585       case Iop_CmpNEZ16x8:
4586       case Iop_Clz16x8:
4587       case Iop_Cls16x8:
4588       case Iop_Abs16x8:
4589       case Iop_Ctz16x8:
4590          return mkPCast16x8(mce, vatom);
4591 
4592       case Iop_CmpNEZ32x2:
4593       case Iop_Clz32x2:
4594       case Iop_Cls32x2:
4595       case Iop_FtoI32Ux2_RZ:
4596       case Iop_FtoI32Sx2_RZ:
4597       case Iop_Abs32x2:
4598          return mkPCast32x2(mce, vatom);
4599 
4600       case Iop_CmpNEZ32x4:
4601       case Iop_Clz32x4:
4602       case Iop_Cls32x4:
4603       case Iop_FtoI32Ux4_RZ:
4604       case Iop_FtoI32Sx4_RZ:
4605       case Iop_Abs32x4:
4606       case Iop_RSqrtEst32Ux4:
4607       case Iop_Ctz32x4:
4608          return mkPCast32x4(mce, vatom);
4609 
4610       case Iop_CmpwNEZ32:
4611          return mkPCastTo(mce, Ity_I32, vatom);
4612 
4613       case Iop_CmpwNEZ64:
4614          return mkPCastTo(mce, Ity_I64, vatom);
4615 
4616       case Iop_CmpNEZ64x2:
4617       case Iop_CipherSV128:
4618       case Iop_Clz64x2:
4619       case Iop_Abs64x2:
4620       case Iop_Ctz64x2:
4621          return mkPCast64x2(mce, vatom);
4622 
4623       case Iop_PwBitMtxXpose64x2:
4624          return assignNew('V', mce, Ity_V128, unop(op, vatom));
4625 
4626       case Iop_NarrowUn16to8x8:
4627       case Iop_NarrowUn32to16x4:
4628       case Iop_NarrowUn64to32x2:
4629       case Iop_QNarrowUn16Sto8Sx8:
4630       case Iop_QNarrowUn16Sto8Ux8:
4631       case Iop_QNarrowUn16Uto8Ux8:
4632       case Iop_QNarrowUn32Sto16Sx4:
4633       case Iop_QNarrowUn32Sto16Ux4:
4634       case Iop_QNarrowUn32Uto16Ux4:
4635       case Iop_QNarrowUn64Sto32Sx2:
4636       case Iop_QNarrowUn64Sto32Ux2:
4637       case Iop_QNarrowUn64Uto32Ux2:
4638       case Iop_F32toF16x4:
4639          return vectorNarrowUnV128(mce, op, vatom);
4640 
4641       case Iop_Widen8Sto16x8:
4642       case Iop_Widen8Uto16x8:
4643       case Iop_Widen16Sto32x4:
4644       case Iop_Widen16Uto32x4:
4645       case Iop_Widen32Sto64x2:
4646       case Iop_Widen32Uto64x2:
4647       case Iop_F16toF32x4:
4648          return vectorWidenI64(mce, op, vatom);
4649 
4650       case Iop_PwAddL32Ux2:
4651       case Iop_PwAddL32Sx2:
4652          return mkPCastTo(mce, Ity_I64,
4653                assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
4654 
4655       case Iop_PwAddL16Ux4:
4656       case Iop_PwAddL16Sx4:
4657          return mkPCast32x2(mce,
4658                assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
4659 
4660       case Iop_PwAddL8Ux8:
4661       case Iop_PwAddL8Sx8:
4662          return mkPCast16x4(mce,
4663                assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
4664 
4665       case Iop_PwAddL32Ux4:
4666       case Iop_PwAddL32Sx4:
4667          return mkPCast64x2(mce,
4668                assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
4669 
4670       case Iop_PwAddL16Ux8:
4671       case Iop_PwAddL16Sx8:
4672          return mkPCast32x4(mce,
4673                assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
4674 
4675       case Iop_PwAddL8Ux16:
4676       case Iop_PwAddL8Sx16:
4677          return mkPCast16x8(mce,
4678                assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
4679 
4680       case Iop_I64UtoF32:
4681       default:
4682          ppIROp(op);
4683          VG_(tool_panic)("memcheck:expr2vbits_Unop");
4684    }
4685 }
4686 
4687 
4688 /* Worker function -- do not call directly.  See comments on
4689    expr2vbits_Load for the meaning of |guard|.
4690 
4691    Generates IR to (1) perform a definedness test of |addr|, (2)
4692    perform a validity test of |addr|, and (3) return the Vbits for the
4693    location indicated by |addr|.  All of this only happens when
4694    |guard| is NULL or |guard| evaluates to True at run time.
4695 
4696    If |guard| evaluates to False at run time, the returned value is
4697       the IR-mandated 0x55..55 value, and no checks or shadow loads are
4698    performed.
4699 
4700    The definedness of |guard| itself is not checked.  That is assumed
4701    to have been done before this point, by the caller. */
4702 static
4703 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
4704                               IREndness end, IRType ty,
4705                               IRAtom* addr, UInt bias, IRAtom* guard )
4706 {
4707    tl_assert(isOriginalAtom(mce,addr));
4708    tl_assert(end == Iend_LE || end == Iend_BE);
4709 
4710    /* First, emit a definedness test for the address.  This also sets
4711       the address (shadow) to 'defined' following the test. */
4712    complainIfUndefined( mce, addr, guard );
4713 
4714    /* Now cook up a call to the relevant helper function, to read the
4715       data V bits from shadow memory. */
4716    ty = shadowTypeV(ty);
4717 
4718    void*        helper           = NULL;
4719    const HChar* hname            = NULL;
4720    Bool         ret_via_outparam = False;
4721 
4722    if (end == Iend_LE) {
4723       switch (ty) {
4724          case Ity_V256: helper = &MC_(helperc_LOADV256le);
4725                         hname = "MC_(helperc_LOADV256le)";
4726                         ret_via_outparam = True;
4727                         break;
4728          case Ity_V128: helper = &MC_(helperc_LOADV128le);
4729                         hname = "MC_(helperc_LOADV128le)";
4730                         ret_via_outparam = True;
4731                         break;
4732          case Ity_I64:  helper = &MC_(helperc_LOADV64le);
4733                         hname = "MC_(helperc_LOADV64le)";
4734                         break;
4735          case Ity_I32:  helper = &MC_(helperc_LOADV32le);
4736                         hname = "MC_(helperc_LOADV32le)";
4737                         break;
4738          case Ity_I16:  helper = &MC_(helperc_LOADV16le);
4739                         hname = "MC_(helperc_LOADV16le)";
4740                         break;
4741          case Ity_I8:   helper = &MC_(helperc_LOADV8);
4742                         hname = "MC_(helperc_LOADV8)";
4743                         break;
4744          default:       ppIRType(ty);
4745                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
4746       }
4747    } else {
4748       switch (ty) {
4749          case Ity_V256: helper = &MC_(helperc_LOADV256be);
4750                         hname = "MC_(helperc_LOADV256be)";
4751                         ret_via_outparam = True;
4752                         break;
4753          case Ity_V128: helper = &MC_(helperc_LOADV128be);
4754                         hname = "MC_(helperc_LOADV128be)";
4755                         ret_via_outparam = True;
4756                         break;
4757          case Ity_I64:  helper = &MC_(helperc_LOADV64be);
4758                         hname = "MC_(helperc_LOADV64be)";
4759                         break;
4760          case Ity_I32:  helper = &MC_(helperc_LOADV32be);
4761                         hname = "MC_(helperc_LOADV32be)";
4762                         break;
4763          case Ity_I16:  helper = &MC_(helperc_LOADV16be);
4764                         hname = "MC_(helperc_LOADV16be)";
4765                         break;
4766          case Ity_I8:   helper = &MC_(helperc_LOADV8);
4767                         hname = "MC_(helperc_LOADV8)";
4768                         break;
4769          default:       ppIRType(ty);
4770                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
4771       }
4772    }
4773 
4774    tl_assert(helper);
4775    tl_assert(hname);
4776 
4777    /* Generate the actual address into addrAct. */
4778    IRAtom* addrAct;
4779    if (bias == 0) {
4780       addrAct = addr;
4781    } else {
4782       IROp    mkAdd;
4783       IRAtom* eBias;
4784       IRType  tyAddr  = mce->hWordTy;
4785       tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
4786       mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
4787       eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
4788       addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
4789    }
4790 
4791    /* We need to have a place to park the V bits we're just about to
4792       read. */
4793    IRTemp datavbits = newTemp(mce, ty, VSh);
4794 
4795    /* Here's the call. */
4796    IRDirty* di;
4797    if (ret_via_outparam) {
4798       di = unsafeIRDirty_1_N( datavbits,
4799                               2/*regparms*/,
4800                               hname, VG_(fnptr_to_fnentry)( helper ),
4801                               mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
4802    } else {
4803       di = unsafeIRDirty_1_N( datavbits,
4804                               1/*regparms*/,
4805                               hname, VG_(fnptr_to_fnentry)( helper ),
4806                               mkIRExprVec_1( addrAct ) );
4807    }
4808 
4809    setHelperAnns( mce, di );
4810    if (guard) {
4811       di->guard = guard;
4812       /* Ideally the didn't-happen return value here would be all-ones
4813          (all-undefined), so it'd be obvious if it got used
4814          inadvertently.  We can get by with the IR-mandated default
4815          value (0b01 repeating, 0x55 etc) as that'll still look pretty
4816          undefined if it ever leaks out. */
4817    }
4818    stmt( 'V', mce, IRStmt_Dirty(di) );
4819 
4820    return mkexpr(datavbits);
4821 }
4822 
4823 
4824 /* Generate IR to do a shadow load.  The helper is expected to check
4825    the validity of the address and return the V bits for that address.
4826    This can optionally be controlled by a guard, which is assumed to
4827    be True if NULL.  In the case where the guard is False at runtime,
4828    the helper will return the didn't-do-the-call value of 0x55..55.
4829    Since that means "completely undefined result", the caller of
4830    this function will need to fix up the result somehow in that
4831    case.
4832 
4833    Caller of this function is also expected to have checked the
4834    definedness of |guard| before this point.
4835 */
4836 static
4837 IRAtom* expr2vbits_Load ( MCEnv* mce,
4838                           IREndness end, IRType ty,
4839                           IRAtom* addr, UInt bias,
4840                           IRAtom* guard )
4841 {
4842    tl_assert(end == Iend_LE || end == Iend_BE);
4843    switch (shadowTypeV(ty)) {
4844       case Ity_I8:
4845       case Ity_I16:
4846       case Ity_I32:
4847       case Ity_I64:
4848       case Ity_V128:
4849       case Ity_V256:
4850          return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
4851       default:
4852          VG_(tool_panic)("expr2vbits_Load");
4853    }
4854 }
4855 
4856 
4857 /* The most general handler for guarded loads.  Assumes the
4858    definedness of GUARD has already been checked by the caller.  A
4859    GUARD of NULL is assumed to mean "always True".  Generates code to
4860    check the definedness and validity of ADDR.
4861 
4862    Generate IR to do a shadow load from ADDR and return the V bits.
4863    The loaded type is TY.  The loaded data is then (shadow) widened by
4864    using VWIDEN, which can be Iop_INVALID to denote a no-op.  If GUARD
4865    evaluates to False at run time then the returned Vbits are simply
4866    VALT instead.  Note therefore that the argument type of VWIDEN must
4867    be TY and the result type of VWIDEN must equal the type of VALT.
4868 */
4869 static
4870 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
4871                                           IREndness end, IRType ty,
4872                                           IRAtom* addr, UInt bias,
4873                                           IRAtom* guard,
4874                                           IROp vwiden, IRAtom* valt )
4875 {
4876    /* Sanity check the conversion operation, and also set TYWIDE. */
4877    IRType tyWide = Ity_INVALID;
4878    switch (vwiden) {
4879       case Iop_INVALID:
4880          tyWide = ty;
4881          break;
4882       case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
4883          tyWide = Ity_I32;
4884          break;
4885       default:
4886          VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
4887    }
4888 
4889    /* If the guard evaluates to True, this will hold the loaded V bits
4890       at TY.  If the guard evaluates to False, this will be all
4891       ones, meaning "all undefined", in which case we will have to
4892       replace it using an ITE below. */
4893    IRAtom* iftrue1
4894       = assignNew('V', mce, ty,
4895                   expr2vbits_Load(mce, end, ty, addr, bias, guard));
4896    /* Now (shadow-) widen the loaded V bits to the desired width.  In
4897       the guard-is-False case, the allowable widening operators will
4898       in the worst case (unsigned widening) at least leave the
4899       pre-widened part as being marked all-undefined, and in the best
4900       case (signed widening) mark the whole widened result as
4901       undefined.  Anyway, it doesn't matter really, since in this case
4902       we will replace said value with the default value |valt| using an
4903       ITE. */
4904    IRAtom* iftrue2
4905       = vwiden == Iop_INVALID
4906            ? iftrue1
4907            : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
4908    /* These are the V bits we will return if the load doesn't take
4909       place. */
4910    IRAtom* iffalse
4911       = valt;
4912    /* Prepare the cond for the ITE.  Convert a NULL cond into
4913       something that iropt knows how to fold out later. */
4914    IRAtom* cond
4915       = guard == NULL  ? mkU1(1)  : guard;
4916    /* And assemble the final result. */
4917    return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
4918 }
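/* Illustrative shape of the IR generated above, assuming a widening
   conversion is requested (say VWIDEN == Iop_8Uto32, so TY == Ity_I8
   and TYWIDE == Ity_I32):

      t1     = <guarded shadow load of I8 at ADDR+BIAS>    (iftrue1)
      t2     = 8Uto32(t1)                                  (iftrue2)
      result = ITE(GUARD, t2, VALT)

   so if GUARD is False at run time the caller sees exactly VALT. */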
4919 
4920 
4921 /* A simpler handler for guarded loads, in which there is no
4922    conversion operation, and the default V bit return (when the guard
4923    evaluates to False at runtime) is "all defined".  If there is no
4924    guard expression or the guard is always TRUE this function behaves
4925    like expr2vbits_Load.  It is assumed that definedness of GUARD has
4926    already been checked at the call site. */
4927 static
4928 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
4929                                          IREndness end, IRType ty,
4930                                          IRAtom* addr, UInt bias,
4931                                          IRAtom *guard )
4932 {
4933    return expr2vbits_Load_guarded_General(
4934              mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
4935           );
4936 }
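/* For example, do_shadow_Dirty (below) uses this to summarise guarded
   helper memory reads: when the guard is False the load contributes
   all-defined V bits, so memory that is not actually read never adds
   spurious undefinedness. */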
4937 
4938 
4939 static
4940 IRAtom* expr2vbits_ITE ( MCEnv* mce,
4941                          IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
4942 {
4943    IRAtom *vbitsC, *vbits0, *vbits1;
4944    IRType ty;
4945    /* Given ITE(cond, iftrue,  iffalse),  generate
4946             ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
4947       That is, steer the V bits like the originals, but trash the
4948       result if the steering value is undefined.  This gives
4949       lazy propagation. */
4950    tl_assert(isOriginalAtom(mce, cond));
4951    tl_assert(isOriginalAtom(mce, iftrue));
4952    tl_assert(isOriginalAtom(mce, iffalse));
4953 
4954    vbitsC = expr2vbits(mce, cond);
4955    vbits1 = expr2vbits(mce, iftrue);
4956    vbits0 = expr2vbits(mce, iffalse);
4957    ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
4958 
4959    return
4960       mkUifU(mce, ty, assignNew('V', mce, ty,
4961                                      IRExpr_ITE(cond, vbits1, vbits0)),
4962                       mkPCastTo(mce, ty, vbitsC) );
4963 }
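/* Example of the effect: if cond# is 1 (condition undefined), the
   PCast spreads that single bit across every bit of the result type,
   and the UifU then marks the entire ITE result as undefined,
   whichever arm was selected.  If cond# is 0 (condition defined), the
   PCast contributes all-zeroes and the result V bits are simply those
   of the selected arm. */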
4964 
4965 /* --------- This is the main expression-handling function. --------- */
4966 
4967 static
4968 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
4969 {
4970    switch (e->tag) {
4971 
4972       case Iex_Get:
4973          return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
4974 
4975       case Iex_GetI:
4976          return shadow_GETI( mce, e->Iex.GetI.descr,
4977                                   e->Iex.GetI.ix, e->Iex.GetI.bias );
4978 
4979       case Iex_RdTmp:
4980          return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
4981 
4982       case Iex_Const:
4983          return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
4984 
4985       case Iex_Qop:
4986          return expr2vbits_Qop(
4987                    mce,
4988                    e->Iex.Qop.details->op,
4989                    e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
4990                    e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
4991                 );
4992 
4993       case Iex_Triop:
4994          return expr2vbits_Triop(
4995                    mce,
4996                    e->Iex.Triop.details->op,
4997                    e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
4998                    e->Iex.Triop.details->arg3
4999                 );
5000 
5001       case Iex_Binop:
5002          return expr2vbits_Binop(
5003                    mce,
5004                    e->Iex.Binop.op,
5005                    e->Iex.Binop.arg1, e->Iex.Binop.arg2
5006                 );
5007 
5008       case Iex_Unop:
5009          return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
5010 
5011       case Iex_Load:
5012          return expr2vbits_Load( mce, e->Iex.Load.end,
5013                                       e->Iex.Load.ty,
5014                                       e->Iex.Load.addr, 0/*addr bias*/,
5015                                       NULL/* guard == "always True"*/ );
5016 
5017       case Iex_CCall:
5018          return mkLazyN( mce, e->Iex.CCall.args,
5019                               e->Iex.CCall.retty,
5020                               e->Iex.CCall.cee );
5021 
5022       case Iex_ITE:
5023          return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
5024                                      e->Iex.ITE.iffalse);
5025 
5026       default:
5027          VG_(printf)("\n");
5028          ppIRExpr(e);
5029          VG_(printf)("\n");
5030          VG_(tool_panic)("memcheck: expr2vbits");
5031    }
5032 }
5033 
5034 /*------------------------------------------------------------*/
5035 /*--- Generate shadow stmts from all kinds of IRStmts.     ---*/
5036 /*------------------------------------------------------------*/
5037 
5038 /* Widen a value to the host word size. */
5039 
5040 static
5041 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
5042 {
5043    IRType ty, tyH;
5044 
5045    /* vatom is a vbits-value and as such can only have a shadow type. */
5046    tl_assert(isShadowAtom(mce,vatom));
5047 
5048    ty  = typeOfIRExpr(mce->sb->tyenv, vatom);
5049    tyH = mce->hWordTy;
5050 
5051    if (tyH == Ity_I32) {
5052       switch (ty) {
5053          case Ity_I32:
5054             return vatom;
5055          case Ity_I16:
5056             return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
5057          case Ity_I8:
5058             return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
5059          default:
5060             goto unhandled;
5061       }
5062    } else
5063    if (tyH == Ity_I64) {
5064       switch (ty) {
5065          case Ity_I32:
5066             return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
5067          case Ity_I16:
5068             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5069                    assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
5070          case Ity_I8:
5071             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5072                    assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
5073          default:
5074             goto unhandled;
5075       }
5076    } else {
5077       goto unhandled;
5078    }
5079   unhandled:
5080    VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
5081    VG_(tool_panic)("zwidenToHostWord");
5082 }
5083 
5084 
5085 /* Generate a shadow store.  |addr| is always the original address
5086    atom.  You can pass in either originals or V-bits for the data
5087    atom, but obviously not both.  This function generates a check for
5088    the definedness and (indirectly) the validity of |addr|, but only
5089    when |guard| evaluates to True at run time (or is NULL).
5090 
5091    |guard| :: Ity_I1 controls whether the store really happens; NULL
5092    means it unconditionally does.  Note that |guard| itself is not
5093    checked for definedness; the caller of this function must do that
5094    if necessary.
5095 */
5096 static
5097 void do_shadow_Store ( MCEnv* mce,
5098                        IREndness end,
5099                        IRAtom* addr, UInt bias,
5100                        IRAtom* data, IRAtom* vdata,
5101                        IRAtom* guard )
5102 {
5103    IROp     mkAdd;
5104    IRType   ty, tyAddr;
5105    void*    helper = NULL;
5106    const HChar* hname = NULL;
5107    IRConst* c;
5108 
5109    tyAddr = mce->hWordTy;
5110    mkAdd  = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5111    tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5112    tl_assert( end == Iend_LE || end == Iend_BE );
5113 
5114    if (data) {
5115       tl_assert(!vdata);
5116       tl_assert(isOriginalAtom(mce, data));
5117       tl_assert(bias == 0);
5118       vdata = expr2vbits( mce, data );
5119    } else {
5120       tl_assert(vdata);
5121    }
5122 
5123    tl_assert(isOriginalAtom(mce,addr));
5124    tl_assert(isShadowAtom(mce,vdata));
5125 
5126    if (guard) {
5127       tl_assert(isOriginalAtom(mce, guard));
5128       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
5129    }
5130 
5131    ty = typeOfIRExpr(mce->sb->tyenv, vdata);
5132 
5133    // If we're not doing undefined value checking, pretend that this value
5134    // is "all valid".  That lets Vex's optimiser remove some of the V bit
5135    // shadow computation ops that precede it.
5136    if (MC_(clo_mc_level) == 1) {
5137       switch (ty) {
5138          case Ity_V256: // V256 weirdness -- used four times
5139                         c = IRConst_V256(V_BITS32_DEFINED); break;
5140          case Ity_V128: // V128 weirdness -- used twice
5141                         c = IRConst_V128(V_BITS16_DEFINED); break;
5142          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
5143          case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
5144          case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
5145          case Ity_I8:   c = IRConst_U8  (V_BITS8_DEFINED);  break;
5146          default:       VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5147       }
5148       vdata = IRExpr_Const( c );
5149    }
5150 
5151    /* First, emit a definedness test for the address.  This also sets
5152       the address (shadow) to 'defined' following the test.  Both of
5153       those actions are gated on |guard|. */
5154    complainIfUndefined( mce, addr, guard );
5155 
5156    /* Now decide which helper function to call to write the data V
5157       bits into shadow memory. */
5158    if (end == Iend_LE) {
5159       switch (ty) {
5160          case Ity_V256: /* we'll use the helper four times */
5161          case Ity_V128: /* we'll use the helper twice */
5162          case Ity_I64: helper = &MC_(helperc_STOREV64le);
5163                        hname = "MC_(helperc_STOREV64le)";
5164                        break;
5165          case Ity_I32: helper = &MC_(helperc_STOREV32le);
5166                        hname = "MC_(helperc_STOREV32le)";
5167                        break;
5168          case Ity_I16: helper = &MC_(helperc_STOREV16le);
5169                        hname = "MC_(helperc_STOREV16le)";
5170                        break;
5171          case Ity_I8:  helper = &MC_(helperc_STOREV8);
5172                        hname = "MC_(helperc_STOREV8)";
5173                        break;
5174          default:      VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5175       }
5176    } else {
5177       switch (ty) {
5178          case Ity_V128: /* we'll use the helper twice */
5179          case Ity_I64: helper = &MC_(helperc_STOREV64be);
5180                        hname = "MC_(helperc_STOREV64be)";
5181                        break;
5182          case Ity_I32: helper = &MC_(helperc_STOREV32be);
5183                        hname = "MC_(helperc_STOREV32be)";
5184                        break;
5185          case Ity_I16: helper = &MC_(helperc_STOREV16be);
5186                        hname = "MC_(helperc_STOREV16be)";
5187                        break;
5188          case Ity_I8:  helper = &MC_(helperc_STOREV8);
5189                        hname = "MC_(helperc_STOREV8)";
5190                        break;
5191          /* Note, no V256 case here, because no big-endian target that
5192             we support has 256-bit vectors. */
5193          default:      VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
5194       }
5195    }
5196 
5197    if (UNLIKELY(ty == Ity_V256)) {
5198 
5199       /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
5200          Q3 being the most significant lane. */
5201       /* These are the offsets of the Qs in memory. */
5202       Int     offQ0, offQ1, offQ2, offQ3;
5203 
5204       /* Various bits for constructing the 4 lane helper calls */
5205       IRDirty *diQ0,    *diQ1,    *diQ2,    *diQ3;
5206       IRAtom  *addrQ0,  *addrQ1,  *addrQ2,  *addrQ3;
5207       IRAtom  *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
5208       IRAtom  *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
5209 
5210       if (end == Iend_LE) {
5211          offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
5212       } else {
5213          offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
5214       }
5215 
5216       eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
5217       addrQ0  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
5218       vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
5219       diQ0    = unsafeIRDirty_0_N(
5220                    1/*regparms*/,
5221                    hname, VG_(fnptr_to_fnentry)( helper ),
5222                    mkIRExprVec_2( addrQ0, vdataQ0 )
5223                 );
5224 
5225       eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
5226       addrQ1  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
5227       vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
5228       diQ1    = unsafeIRDirty_0_N(
5229                    1/*regparms*/,
5230                    hname, VG_(fnptr_to_fnentry)( helper ),
5231                    mkIRExprVec_2( addrQ1, vdataQ1 )
5232                 );
5233 
5234       eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
5235       addrQ2  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
5236       vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
5237       diQ2    = unsafeIRDirty_0_N(
5238                    1/*regparms*/,
5239                    hname, VG_(fnptr_to_fnentry)( helper ),
5240                    mkIRExprVec_2( addrQ2, vdataQ2 )
5241                 );
5242 
5243       eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
5244       addrQ3  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
5245       vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
5246       diQ3    = unsafeIRDirty_0_N(
5247                    1/*regparms*/,
5248                    hname, VG_(fnptr_to_fnentry)( helper ),
5249                    mkIRExprVec_2( addrQ3, vdataQ3 )
5250                 );
5251 
5252       if (guard)
5253          diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
5254 
5255       setHelperAnns( mce, diQ0 );
5256       setHelperAnns( mce, diQ1 );
5257       setHelperAnns( mce, diQ2 );
5258       setHelperAnns( mce, diQ3 );
5259       stmt( 'V', mce, IRStmt_Dirty(diQ0) );
5260       stmt( 'V', mce, IRStmt_Dirty(diQ1) );
5261       stmt( 'V', mce, IRStmt_Dirty(diQ2) );
5262       stmt( 'V', mce, IRStmt_Dirty(diQ3) );
5263 
5264    }
5265    else if (UNLIKELY(ty == Ity_V128)) {
5266 
5267       /* V128-bit case */
5268       /* See comment in next clause re 64-bit regparms */
5269       /* also, need to be careful about endianness */
5270 
5271       Int     offLo64, offHi64;
5272       IRDirty *diLo64, *diHi64;
5273       IRAtom  *addrLo64, *addrHi64;
5274       IRAtom  *vdataLo64, *vdataHi64;
5275       IRAtom  *eBiasLo64, *eBiasHi64;
5276 
5277       if (end == Iend_LE) {
5278          offLo64 = 0;
5279          offHi64 = 8;
5280       } else {
5281          offLo64 = 8;
5282          offHi64 = 0;
5283       }
5284 
5285       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
5286       addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
5287       vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
5288       diLo64    = unsafeIRDirty_0_N(
5289                      1/*regparms*/,
5290                      hname, VG_(fnptr_to_fnentry)( helper ),
5291                      mkIRExprVec_2( addrLo64, vdataLo64 )
5292                   );
5293       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
5294       addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
5295       vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
5296       diHi64    = unsafeIRDirty_0_N(
5297                      1/*regparms*/,
5298                      hname, VG_(fnptr_to_fnentry)( helper ),
5299                      mkIRExprVec_2( addrHi64, vdataHi64 )
5300                   );
5301       if (guard) diLo64->guard = guard;
5302       if (guard) diHi64->guard = guard;
5303       setHelperAnns( mce, diLo64 );
5304       setHelperAnns( mce, diHi64 );
5305       stmt( 'V', mce, IRStmt_Dirty(diLo64) );
5306       stmt( 'V', mce, IRStmt_Dirty(diHi64) );
5307 
5308    } else {
5309 
5310       IRDirty *di;
5311       IRAtom  *addrAct;
5312 
5313       /* 8/16/32/64-bit cases */
5314       /* Generate the actual address into addrAct. */
5315       if (bias == 0) {
5316          addrAct = addr;
5317       } else {
5318          IRAtom* eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5319          addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
5320       }
5321 
5322       if (ty == Ity_I64) {
5323          /* We can't do this with regparm 2 on 32-bit platforms, since
5324             the back ends aren't clever enough to handle 64-bit
5325             regparm args.  Therefore be different. */
5326          di = unsafeIRDirty_0_N(
5327                  1/*regparms*/,
5328                  hname, VG_(fnptr_to_fnentry)( helper ),
5329                  mkIRExprVec_2( addrAct, vdata )
5330               );
5331       } else {
5332          di = unsafeIRDirty_0_N(
5333                  2/*regparms*/,
5334                  hname, VG_(fnptr_to_fnentry)( helper ),
5335                  mkIRExprVec_2( addrAct,
5336                                 zwidenToHostWord( mce, vdata ))
5337               );
5338       }
5339       if (guard) di->guard = guard;
5340       setHelperAnns( mce, di );
5341       stmt( 'V', mce, IRStmt_Dirty(di) );
5342    }
5343 
5344 }
5345 
5346 
5347 /* Do lazy pessimistic propagation through a dirty helper call, by
5348    looking at the annotations on it.  This is the most complex part of
5349    Memcheck. */
5350 
5351 static IRType szToITy ( Int n )
5352 {
5353    switch (n) {
5354       case 1: return Ity_I8;
5355       case 2: return Ity_I16;
5356       case 4: return Ity_I32;
5357       case 8: return Ity_I64;
5358       default: VG_(tool_panic)("szToITy(memcheck)");
5359    }
5360 }
5361 
5362 static
5363 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
5364 {
5365    Int       i, k, n, toDo, gSz, gOff;
5366    IRAtom    *src, *here, *curr;
5367    IRType    tySrc, tyDst;
5368    IRTemp    dst;
5369    IREndness end;
5370 
5371    /* What's the native endianness?  We need to know this. */
5372 #  if defined(VG_BIGENDIAN)
5373    end = Iend_BE;
5374 #  elif defined(VG_LITTLEENDIAN)
5375    end = Iend_LE;
5376 #  else
5377 #    error "Unknown endianness"
5378 #  endif
5379 
5380    /* First check the guard. */
5381    complainIfUndefined(mce, d->guard, NULL);
5382 
5383    /* Now round up all inputs and PCast over them. */
5384    curr = definedOfType(Ity_I32);
5385 
5386    /* Inputs: unmasked args
5387       Note: arguments are evaluated REGARDLESS of the guard expression */
5388    for (i = 0; d->args[i]; i++) {
5389       IRAtom* arg = d->args[i];
5390       if ( (d->cee->mcx_mask & (1<<i))
5391            || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
5392          /* ignore this arg */
5393       } else {
5394          here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg) );
5395          curr = mkUifU32(mce, here, curr);
5396       }
5397    }
5398 
5399    /* Inputs: guest state that we read. */
5400    for (i = 0; i < d->nFxState; i++) {
5401       tl_assert(d->fxState[i].fx != Ifx_None);
5402       if (d->fxState[i].fx == Ifx_Write)
5403          continue;
5404 
5405       /* Enumerate the described state segments */
5406       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5407          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5408          gSz  = d->fxState[i].size;
5409 
5410          /* Ignore any sections marked as 'always defined'. */
5411          if (isAlwaysDefd(mce, gOff, gSz)) {
5412             if (0)
5413             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
5414                         gOff, gSz);
5415             continue;
5416          }
5417 
5418          /* This state element is read or modified.  So we need to
5419             consider it.  If larger than 8 bytes, deal with it in
5420             8-byte chunks. */
5421          while (True) {
5422             tl_assert(gSz >= 0);
5423             if (gSz == 0) break;
5424             n = gSz <= 8 ? gSz : 8;
5425             /* update 'curr' with UifU of the state slice
5426                gOff .. gOff+n-1 */
5427             tySrc = szToITy( n );
5428 
5429             /* Observe the guard expression. If it is false use an
5430                all-bits-defined bit pattern */
5431             IRAtom *cond, *iffalse, *iftrue;
5432 
5433             cond    = assignNew('V', mce, Ity_I1, d->guard);
5434             iftrue  = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
5435             iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
5436             src     = assignNew('V', mce, tySrc,
5437                                 IRExpr_ITE(cond, iftrue, iffalse));
5438 
5439             here = mkPCastTo( mce, Ity_I32, src );
5440             curr = mkUifU32(mce, here, curr);
5441             gSz -= n;
5442             gOff += n;
5443          }
5444       }
5445    }
5446 
5447    /* Inputs: memory.  First set up some info needed regardless of
5448       whether we're doing reads or writes. */
5449 
5450    if (d->mFx != Ifx_None) {
5451       /* Because we may do multiple shadow loads/stores from the same
5452          base address, it's best to do a single test of its
5453          definedness right now.  Post-instrumentation optimisation
5454          should remove all but this test. */
5455       IRType tyAddr;
5456       tl_assert(d->mAddr);
5457       complainIfUndefined(mce, d->mAddr, d->guard);
5458 
5459       tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
5460       tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
5461       tl_assert(tyAddr == mce->hWordTy); /* not really right */
5462    }
5463 
5464    /* Deal with memory inputs (reads or modifies) */
5465    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
5466       toDo   = d->mSize;
5467       /* chew off 32-bit chunks.  We don't care about the endianness
5468          since it's all going to be condensed down to a single bit,
5469          but nevertheless choose an endianness which is hopefully
5470          native to the platform. */
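      /* Worked example: for d->mSize == 7 this emits guarded shadow
         loads of an I32 at offset 0, an I16 at offset 4 and an I8 at
         offset 6, each PCast-ed to Ity_I32 and UifU'd into 'curr'. */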
5471       while (toDo >= 4) {
5472          here = mkPCastTo(
5473                    mce, Ity_I32,
5474                    expr2vbits_Load_guarded_Simple(
5475                       mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
5476                 );
5477          curr = mkUifU32(mce, here, curr);
5478          toDo -= 4;
5479       }
5480       /* chew off 16-bit chunks */
5481       while (toDo >= 2) {
5482          here = mkPCastTo(
5483                    mce, Ity_I32,
5484                    expr2vbits_Load_guarded_Simple(
5485                       mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
5486                 );
5487          curr = mkUifU32(mce, here, curr);
5488          toDo -= 2;
5489       }
5490       /* chew off the remaining 8-bit chunk, if any */
5491       if (toDo == 1) {
5492          here = mkPCastTo(
5493                    mce, Ity_I32,
5494                    expr2vbits_Load_guarded_Simple(
5495                       mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
5496                 );
5497          curr = mkUifU32(mce, here, curr);
5498          toDo -= 1;
5499       }
5500       tl_assert(toDo == 0);
5501    }
5502 
5503    /* Whew!  So curr is a 32-bit V-value summarising pessimistically
5504       all the inputs to the helper.  Now we need to re-distribute the
5505       results to all destinations. */
5506 
5507    /* Outputs: the destination temporary, if there is one. */
5508    if (d->tmp != IRTemp_INVALID) {
5509       dst   = findShadowTmpV(mce, d->tmp);
5510       tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
5511       assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
5512    }
5513 
5514    /* Outputs: guest state that we write or modify. */
5515    for (i = 0; i < d->nFxState; i++) {
5516       tl_assert(d->fxState[i].fx != Ifx_None);
5517       if (d->fxState[i].fx == Ifx_Read)
5518          continue;
5519 
5520       /* Enumerate the described state segments */
5521       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5522          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5523          gSz  = d->fxState[i].size;
5524 
5525          /* Ignore any sections marked as 'always defined'. */
5526          if (isAlwaysDefd(mce, gOff, gSz))
5527             continue;
5528 
5529          /* This state element is written or modified.  So we need to
5530             consider it.  If larger than 8 bytes, deal with it in
5531             8-byte chunks. */
5532          while (True) {
5533             tl_assert(gSz >= 0);
5534             if (gSz == 0) break;
5535             n = gSz <= 8 ? gSz : 8;
5536             /* Write suitably-casted 'curr' to the state slice
5537                gOff .. gOff+n-1 */
5538             tyDst = szToITy( n );
5539             do_shadow_PUT( mce, gOff,
5540                                 NULL, /* original atom */
5541                                 mkPCastTo( mce, tyDst, curr ), d->guard );
5542             gSz -= n;
5543             gOff += n;
5544          }
5545       }
5546    }
5547 
5548    /* Outputs: memory that we write or modify.  Same comments about
5549       endianness as above apply. */
5550    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
5551       toDo   = d->mSize;
5552       /* chew off 32-bit chunks */
5553       while (toDo >= 4) {
5554          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5555                           NULL, /* original data */
5556                           mkPCastTo( mce, Ity_I32, curr ),
5557                           d->guard );
5558          toDo -= 4;
5559       }
5560       /* chew off 16-bit chunks */
5561       while (toDo >= 2) {
5562          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5563                           NULL, /* original data */
5564                           mkPCastTo( mce, Ity_I16, curr ),
5565                           d->guard );
5566          toDo -= 2;
5567       }
5568       /* chew off the remaining 8-bit chunk, if any */
5569       if (toDo == 1) {
5570          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5571                           NULL, /* original data */
5572                           mkPCastTo( mce, Ity_I8, curr ),
5573                           d->guard );
5574          toDo -= 1;
5575       }
5576       tl_assert(toDo == 0);
5577    }
5578 
5579 }
5580 
5581 
5582 /* We have an ABI hint telling us that [base .. base+len-1] is to
5583    become undefined ("writable").  Generate code to call a helper to
5584    notify the A/V bit machinery of this fact.
5585 
5586    We call
5587    void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
5588                                                     Addr nia );
5589 */
5590 static
5591 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
5592 {
5593    IRDirty* di;
5594 
5595    if (MC_(clo_mc_level) == 3) {
5596       di = unsafeIRDirty_0_N(
5597               3/*regparms*/,
5598               "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
5599               VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
5600               mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
5601            );
5602    } else {
5603       /* We ignore the supplied nia, since it is irrelevant. */
5604       tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
5605       /* Special-case the len==128 case, since that is for amd64-ELF,
5606          which is a very common target. */
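      /* (128 bytes is the size of the red zone below the stack pointer
         in the amd64 SysV ABI, hence the dedicated helper.) */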
5607       if (len == 128) {
5608          di = unsafeIRDirty_0_N(
5609                  1/*regparms*/,
5610                  "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
5611                  VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
5612                  mkIRExprVec_1( base )
5613               );
5614       } else {
5615          di = unsafeIRDirty_0_N(
5616                  2/*regparms*/,
5617                  "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
5618                  VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
5619                  mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
5620               );
5621       }
5622    }
5623 
5624    stmt( 'V', mce, IRStmt_Dirty(di) );
5625 }
5626 
5627 
5628 /* ------ Dealing with IRCAS (big and complex) ------ */
5629 
5630 /* FWDS */
5631 static IRAtom* gen_load_b  ( MCEnv* mce, Int szB,
5632                              IRAtom* baseaddr, Int offset );
5633 static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
5634 static void    gen_store_b ( MCEnv* mce, Int szB,
5635                              IRAtom* baseaddr, Int offset, IRAtom* dataB,
5636                              IRAtom* guard );
5637 
5638 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
5639 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
5640 
5641 
5642 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
5643    IRExpr.Consts, else this asserts.  If they are both Consts, it
5644    doesn't do anything.  So that just leaves the RdTmp case.
5645 
5646    In which case: this assigns the shadow value SHADOW to the IR
5647    shadow temporary associated with ORIG.  That is, ORIG, being an
5648    original temporary, will have a shadow temporary associated with
5649    it.  However, in the case envisaged here, there will so far have
5650    been no IR emitted to actually write a shadow value into that
5651    temporary.  What this routine does is to (emit IR to) copy the
5652    value in SHADOW into said temporary, so that after this call,
5653    IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
5654    value in SHADOW.
5655 
5656    Point is to allow callers to compute "by hand" a shadow value for
5657    ORIG, and force it to be associated with ORIG.
5658 
5659    How do we know that the shadow associated with ORIG has not so far
5660    been assigned to?  Well, we don't per se know that, but supposing
5661    it had.  Then this routine would create a second assignment to it,
5662    and later the IR sanity checker would barf.  But that never
5663    happens.  QED.
5664 */
5665 static void bind_shadow_tmp_to_orig ( UChar how,
5666                                       MCEnv* mce,
5667                                       IRAtom* orig, IRAtom* shadow )
5668 {
5669    tl_assert(isOriginalAtom(mce, orig));
5670    tl_assert(isShadowAtom(mce, shadow));
5671    switch (orig->tag) {
5672       case Iex_Const:
5673          tl_assert(shadow->tag == Iex_Const);
5674          break;
5675       case Iex_RdTmp:
5676          tl_assert(shadow->tag == Iex_RdTmp);
5677          if (how == 'V') {
5678             assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
5679                    shadow);
5680          } else {
5681             tl_assert(how == 'B');
5682             assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
5683                    shadow);
5684          }
5685          break;
5686       default:
5687          tl_assert(0);
5688    }
5689 }
5690 
5691 
5692 static
5693 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
5694 {
5695    /* Scheme is (both single- and double- cases):
5696 
5697       1. fetch data#,dataB (the proposed new value)
5698 
5699       2. fetch expd#,expdB (what we expect to see at the address)
5700 
5701       3. check definedness of address
5702 
5703       4. load old#,oldB from shadow memory; this also checks
5704          addressability of the address
5705 
5706       5. the CAS itself
5707 
5708       6. compute "expected == old".  See COMMENT_ON_CasCmpEQ below.
5709 
5710       7. if "expected == old" (as computed by (6))
5711             store data#,dataB to shadow memory
5712 
5713       Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
5714       'data' but 7 stores 'data#'.  Hence it is possible for the
5715       shadow data to be incorrectly checked and/or updated:
5716 
5717       * 7 is at least gated correctly, since the 'expected == old'
5718         condition is derived from outputs of 5.  However, the shadow
5719         write could happen too late: imagine after 5 we are
5720         descheduled, a different thread runs, writes a different
5721         (shadow) value at the address, and then we resume, hence
5722         overwriting the shadow value written by the other thread.
5723 
5724       Because the original memory access is atomic, there's no way to
5725       make both the original and shadow accesses into a single atomic
5726       thing, hence this is unavoidable.
5727 
5728       At least as Valgrind stands, I don't think it's a problem, since
5729       we're single threaded *and* we guarantee that there are no
5730       context switches during the execution of any specific superblock
5731       -- context switches can only happen at superblock boundaries.
5732 
5733       If Valgrind ever becomes MT in the future, then it might be more
5734       of a problem.  A possible kludge would be to artificially
5735       associate with the location, a lock, which we must acquire and
5736       release around the transaction as a whole.  Hmm, that probably
5737       wouldn't work properly, since it only guards us against other
5738       threads doing CASs on the same location, not against other
5739       threads doing normal reads and writes.
5740 
5741       ------------------------------------------------------------
5742 
5743       COMMENT_ON_CasCmpEQ:
5744 
5745       Note two things.  Firstly, in the sequence above, we compute
5746       "expected == old", but we don't check definedness of it.  Why
5747       not?  Also, the x86 and amd64 front ends use
5748       Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
5749       determination (expected == old ?) for themselves, and we also
5750       don't check definedness for those primops; we just say that the
5751       result is defined.  Why?  Details follow.
5752 
5753       x86/amd64 contains various forms of locked insns:
5754       * lock prefix before any basic arithmetic insn;
5755         eg lock xorl %reg1,(%reg2)
5756       * atomic exchange reg-mem
5757       * compare-and-swaps
5758 
5759       Rather than attempt to represent them all, which would be a
5760       royal PITA, I used a result from Maurice Herlihy
5761       (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
5762       demonstrates that compare-and-swap is a primitive more general
5763       than the other two, and so can be used to represent all of them.
5764       So the translation scheme for (eg) lock incl (%reg) is as
5765       follows:
5766 
5767         again:
5768          old = * %reg
5769          new = old + 1
5770          atomically { if (* %reg == old) { * %reg = new } else { goto again } }
5771 
5772       The "atomically" is the CAS bit.  The scheme is always the same:
5773       get old value from memory, compute new value, atomically stuff
5774       new value back in memory iff the old value has not changed (iow,
5775       no other thread modified it in the meantime).  If it has changed
5776       then we've been out-raced and we have to start over.
5777 
5778       Now that's all very neat, but it has the bad side effect of
5779       introducing an explicit equality test into the translation.
5780       Consider the behaviour of said code on a memory location which
5781       is uninitialised.  We will wind up doing a comparison on
5782       uninitialised data, and mc duly complains.
5783 
5784       What's difficult about this is, the common case is that the
5785       location is uncontended, and so we're usually comparing the same
5786       value (* %reg) with itself.  So we shouldn't complain even if it
5787       is undefined.  But mc doesn't know that.
5788 
5789       My solution is to mark the == in the IR specially, so as to tell
5790       mc that it almost certainly compares a value with itself, and we
5791       should just regard the result as always defined.  Rather than
5792       add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
5793       Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
5794 
5795       So there's always the question of, can this give a false
5796       negative?  eg, imagine that initially, * %reg is defined; and we
5797       read that; but then in the gap between the read and the CAS, a
5798       different thread writes an undefined (and different) value at
5799       the location.  Then the CAS in this thread will fail and we will
5800       go back to "again:", but without knowing that the trip back
5801       there was based on an undefined comparison.  No matter; at least
5802       the other thread won the race and the location is correctly
5803       marked as undefined.  What if it wrote an uninitialised version
5804       of the same value that was there originally, though?
5805 
5806       etc etc.  Seems like there's a small corner case in which we
5807       might lose the fact that something's defined -- we're out-raced
5808       in between the "old = * reg" and the "atomically {", _and_ the
5809       other thread is writing in an undefined version of what's
5810       already there.  Well, that seems pretty unlikely.
5811 
5812       ---
5813 
5814       If we ever need to reinstate it .. code which generates a
5815       definedness test for "expected == old" was removed at r10432 of
5816       this file.
5817    */
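
   /* Illustrative sketch only (schematic, not the exact IR emitted):
      for a 32-bit single CAS  "old = CAS(addr, expd, data)"  the
      seven steps above come out as roughly

         vdata = vbits-of(data)                      -- 1
         vexpd = vbits-of(expd)                      -- 2
         vold  = shadow-load:32(addr)                -- 3,4
         old   = CAS(addr, expd, data)               -- 5 (the original op)
         eq    = CasCmpEQ32(expd, old)               -- 6
         if (eq) shadow-store:32(addr, vdata)        -- 7

      where vbits-of, shadow-load and shadow-store stand for the
      expr2vbits, expr2vbits_Load and do_shadow_Store calls made in
      do_shadow_CAS_single below.  do_shadow_CAS_double does the same
      for the Hi and Lo halves of a double CAS, with an extra xor/or
      step to compute the combined equality. */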
5818    if (cas->oldHi == IRTemp_INVALID) {
5819       do_shadow_CAS_single( mce, cas );
5820    } else {
5821       do_shadow_CAS_double( mce, cas );
5822    }
5823 }
5824 
5825 
5826 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
5827 {
5828    IRAtom *vdataLo = NULL, *bdataLo = NULL;
5829    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
5830    IRAtom *voldLo  = NULL, *boldLo  = NULL;
5831    IRAtom *expd_eq_old = NULL;
5832    IROp   opCasCmpEQ;
5833    Int    elemSzB;
5834    IRType elemTy;
5835    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
5836 
5837    /* single CAS */
5838    tl_assert(cas->oldHi == IRTemp_INVALID);
5839    tl_assert(cas->expdHi == NULL);
5840    tl_assert(cas->dataHi == NULL);
5841 
5842    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
5843    switch (elemTy) {
5844       case Ity_I8:  elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8;  break;
5845       case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
5846       case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
5847       case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
5848       default: tl_assert(0); /* IR defn disallows any other types */
5849    }
5850 
5851    /* 1. fetch data# (the proposed new value) */
5852    tl_assert(isOriginalAtom(mce, cas->dataLo));
5853    vdataLo
5854       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
5855    tl_assert(isShadowAtom(mce, vdataLo));
5856    if (otrak) {
5857       bdataLo
5858          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
5859       tl_assert(isShadowAtom(mce, bdataLo));
5860    }
5861 
5862    /* 2. fetch expected# (what we expect to see at the address) */
5863    tl_assert(isOriginalAtom(mce, cas->expdLo));
5864    vexpdLo
5865       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
5866    tl_assert(isShadowAtom(mce, vexpdLo));
5867    if (otrak) {
5868       bexpdLo
5869          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
5870       tl_assert(isShadowAtom(mce, bexpdLo));
5871    }
5872 
5873    /* 3. check definedness of address */
5874    /* 4. fetch old# from shadow memory; this also checks
5875          addressability of the address */
5876    voldLo
5877       = assignNew(
5878            'V', mce, elemTy,
5879            expr2vbits_Load(
5880               mce,
5881               cas->end, elemTy, cas->addr, 0/*Addr bias*/,
5882               NULL/*always happens*/
5883         ));
5884    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
5885    if (otrak) {
5886       boldLo
5887          = assignNew('B', mce, Ity_I32,
5888                      gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
5889       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
5890    }
5891 
5892    /* 5. the CAS itself */
5893    stmt( 'C', mce, IRStmt_CAS(cas) );
5894 
5895    /* 6. compute "expected == old" */
5896    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
5897    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
5898       tree, but it's not copied from the input block. */
5899    expd_eq_old
5900       = assignNew('C', mce, Ity_I1,
5901                   binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
5902 
5903    /* 7. if "expected == old"
5904             store data# to shadow memory */
5905    do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
5906                     NULL/*data*/, vdataLo/*vdata*/,
5907                     expd_eq_old/*guard for store*/ );
5908    if (otrak) {
5909       gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
5910                    bdataLo/*bdata*/,
5911                    expd_eq_old/*guard for store*/ );
5912    }
5913 }
5914 
5915 
5916 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
5917 {
5918    IRAtom *vdataHi = NULL, *bdataHi = NULL;
5919    IRAtom *vdataLo = NULL, *bdataLo = NULL;
5920    IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
5921    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
5922    IRAtom *voldHi  = NULL, *boldHi  = NULL;
5923    IRAtom *voldLo  = NULL, *boldLo  = NULL;
5924    IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
5925    IRAtom *expd_eq_old = NULL, *zero = NULL;
5926    IROp   opCasCmpEQ, opOr, opXor;
5927    Int    elemSzB, memOffsLo, memOffsHi;
5928    IRType elemTy;
5929    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
5930 
5931    /* double CAS */
5932    tl_assert(cas->oldHi != IRTemp_INVALID);
5933    tl_assert(cas->expdHi != NULL);
5934    tl_assert(cas->dataHi != NULL);
5935 
5936    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
5937    switch (elemTy) {
5938       case Ity_I8:
5939          opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
5940          elemSzB = 1; zero = mkU8(0);
5941          break;
5942       case Ity_I16:
5943          opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
5944          elemSzB = 2; zero = mkU16(0);
5945          break;
5946       case Ity_I32:
5947          opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
5948          elemSzB = 4; zero = mkU32(0);
5949          break;
5950       case Ity_I64:
5951          opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
5952          elemSzB = 8; zero = mkU64(0);
5953          break;
5954       default:
5955          tl_assert(0); /* IR defn disallows any other types */
5956    }
5957 
5958    /* 1. fetch data# (the proposed new value) */
5959    tl_assert(isOriginalAtom(mce, cas->dataHi));
5960    tl_assert(isOriginalAtom(mce, cas->dataLo));
5961    vdataHi
5962       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi));
5963    vdataLo
5964       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
5965    tl_assert(isShadowAtom(mce, vdataHi));
5966    tl_assert(isShadowAtom(mce, vdataLo));
5967    if (otrak) {
5968       bdataHi
5969          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
5970       bdataLo
5971          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
5972       tl_assert(isShadowAtom(mce, bdataHi));
5973       tl_assert(isShadowAtom(mce, bdataLo));
5974    }
5975 
5976    /* 2. fetch expected# (what we expect to see at the address) */
5977    tl_assert(isOriginalAtom(mce, cas->expdHi));
5978    tl_assert(isOriginalAtom(mce, cas->expdLo));
5979    vexpdHi
5980       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi));
5981    vexpdLo
5982       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
5983    tl_assert(isShadowAtom(mce, vexpdHi));
5984    tl_assert(isShadowAtom(mce, vexpdLo));
5985    if (otrak) {
5986       bexpdHi
5987          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
5988       bexpdLo
5989          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
5990       tl_assert(isShadowAtom(mce, bexpdHi));
5991       tl_assert(isShadowAtom(mce, bexpdLo));
5992    }
5993 
5994    /* 3. check definedness of address */
5995    /* 4. fetch old# from shadow memory; this also checks
5996          addressability of the address */
5997    if (cas->end == Iend_LE) {
5998       memOffsLo = 0;
5999       memOffsHi = elemSzB;
6000    } else {
6001       tl_assert(cas->end == Iend_BE);
6002       memOffsLo = elemSzB;
6003       memOffsHi = 0;
6004    }
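
   /* For example (illustrative): a little-endian 2 x 32-bit CAS has
      its Lo element at addr+0 and its Hi element at addr+4, so
      memOffsLo == 0 and memOffsHi == 4; a big-endian double CAS swaps
      the two offsets. */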
6005    voldHi
6006       = assignNew(
6007            'V', mce, elemTy,
6008            expr2vbits_Load(
6009               mce,
6010               cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
6011               NULL/*always happens*/
6012         ));
6013    voldLo
6014       = assignNew(
6015            'V', mce, elemTy,
6016            expr2vbits_Load(
6017               mce,
6018               cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
6019               NULL/*always happens*/
6020         ));
6021    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
6022    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6023    if (otrak) {
6024       boldHi
6025          = assignNew('B', mce, Ity_I32,
6026                      gen_load_b(mce, elemSzB, cas->addr,
6027                                 memOffsHi/*addr bias*/));
6028       boldLo
6029          = assignNew('B', mce, Ity_I32,
6030                      gen_load_b(mce, elemSzB, cas->addr,
6031                                 memOffsLo/*addr bias*/));
6032       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
6033       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6034    }
6035 
6036    /* 5. the CAS itself */
6037    stmt( 'C', mce, IRStmt_CAS(cas) );
6038 
6039    /* 6. compute "expected == old" */
6040    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6041    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6042       tree, but it's not copied from the input block. */
6043    /*
6044       xHi = oldHi ^ expdHi;
6045       xLo = oldLo ^ expdLo;
6046       xHL = xHi | xLo;
6047       expd_eq_old = xHL == 0;
6048    */
6049    xHi = assignNew('C', mce, elemTy,
6050                    binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
6051    xLo = assignNew('C', mce, elemTy,
6052                    binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
6053    xHL = assignNew('C', mce, elemTy,
6054                    binop(opOr, xHi, xLo));
6055    expd_eq_old
6056       = assignNew('C', mce, Ity_I1,
6057                   binop(opCasCmpEQ, xHL, zero));
6058 
6059    /* 7. if "expected == old"
6060             store data# to shadow memory */
6061    do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
6062                     NULL/*data*/, vdataHi/*vdata*/,
6063                     expd_eq_old/*guard for store*/ );
6064    do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
6065                     NULL/*data*/, vdataLo/*vdata*/,
6066                     expd_eq_old/*guard for store*/ );
6067    if (otrak) {
6068       gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
6069                    bdataHi/*bdata*/,
6070                    expd_eq_old/*guard for store*/ );
6071       gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
6072                    bdataLo/*bdata*/,
6073                    expd_eq_old/*guard for store*/ );
6074    }
6075 }
6076 
6077 
6078 /* ------ Dealing with LL/SC (not difficult) ------ */
6079 
6080 static void do_shadow_LLSC ( MCEnv*    mce,
6081                              IREndness stEnd,
6082                              IRTemp    stResult,
6083                              IRExpr*   stAddr,
6084                              IRExpr*   stStoredata )
6085 {
6086    /* In short: treat a load-linked like a normal load followed by an
6087       assignment of the loaded (shadow) data to the result temporary.
6088       Treat a store-conditional like a normal store, and mark the
6089       result temporary as defined. */
6090    IRType resTy  = typeOfIRTemp(mce->sb->tyenv, stResult);
6091    IRTemp resTmp = findShadowTmpV(mce, stResult);
6092 
6093    tl_assert(isIRAtom(stAddr));
6094    if (stStoredata)
6095       tl_assert(isIRAtom(stStoredata));
6096 
6097    if (stStoredata == NULL) {
6098       /* Load Linked */
6099       /* Just treat this as a normal load, followed by an assignment of
6100          the value to .result. */
6101       /* Stay sane */
6102       tl_assert(resTy == Ity_I64 || resTy == Ity_I32
6103                 || resTy == Ity_I16 || resTy == Ity_I8);
6104       assign( 'V', mce, resTmp,
6105                    expr2vbits_Load(
6106                       mce, stEnd, resTy, stAddr, 0/*addr bias*/,
6107                       NULL/*always happens*/) );
6108    } else {
6109       /* Store Conditional */
6110       /* Stay sane */
6111       IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
6112                                    stStoredata);
6113       tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
6114                 || dataTy == Ity_I16 || dataTy == Ity_I8);
6115       do_shadow_Store( mce, stEnd,
6116                             stAddr, 0/* addr bias */,
6117                             stStoredata,
6118                             NULL /* shadow data */,
6119                             NULL/*guard*/ );
6120       /* This is a store conditional, so it writes to .result a value
6121          indicating whether or not the store succeeded.  Just claim
6122          this value is always defined.  In the PowerPC interpretation
6123          of store-conditional, definedness of the success indication
6124          depends on whether the address of the store matches the
6125          reservation address.  But we can't tell that here (and
6126          anyway, we're not being PowerPC-specific).  At least we are
6127          guaranteed that the definedness of the store address, and its
6128          addressability, will be checked as per normal.  So it seems
6129          pretty safe to just say that the success indication is always
6130          defined.
6131 
6132          In schemeS, for origin tracking, we must correspondingly set
6133          a no-origin value for the origin shadow of .result.
6134       */
6135       tl_assert(resTy == Ity_I1);
6136       assign( 'V', mce, resTmp, definedOfType(resTy) );
6137    }
6138 }
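
/* Example (illustrative): for a 32-bit load-linked "result = LL(addr)",
   the net effect above is the same as instrumenting an ordinary 32-bit
   load: result's V-bit shadow is loaded from shadow memory at addr.
   For a 32-bit store-conditional, the data's V bits are stored as for
   an ordinary store, and the Ity_I1 success flag's shadow is simply
   set to "defined", whether or not the SC succeeds at run time. */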
6139 
6140 
6141 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
6142 
6143 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
6144 {
6145    complainIfUndefined(mce, sg->guard, NULL);
6146    /* do_shadow_Store will generate code to check the definedness and
6147       validity of sg->addr, in the case where sg->guard evaluates to
6148       True at run-time. */
6149    do_shadow_Store( mce, sg->end,
6150                     sg->addr, 0/* addr bias */,
6151                     sg->data,
6152                     NULL /* shadow data */,
6153                     sg->guard );
6154 }
6155 
6156 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
6157 {
6158    complainIfUndefined(mce, lg->guard, NULL);
6159    /* expr2vbits_Load_guarded_General will generate code to check the
6160       definedness and validity of lg->addr, in the case where
6161       lg->guard evaluates to True at run-time. */
6162 
6163    /* Look at the LoadG's built-in conversion operation, to determine
6164       the source (actual loaded data) type, and the equivalent IROp.
6165       NOTE that implicitly we are taking a widening operation to be
6166       applied to original atoms and producing one that applies to V
6167       bits.  Since signed and unsigned widening are self-shadowing,
6168       this is a straight copy of the op (modulo swapping from the
6169       IRLoadGOp form to the IROp form).  Note also therefore that this
6170       implicitly duplicates the logic to do with said widening ops in
6171       expr2vbits_Unop.  See comment at the start of expr2vbits_Unop. */
6172    IROp   vwiden   = Iop_INVALID;
6173    IRType loadedTy = Ity_INVALID;
6174    switch (lg->cvt) {
6175       case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
6176       case ILGop_Ident64:   loadedTy = Ity_I64;  vwiden = Iop_INVALID; break;
6177       case ILGop_Ident32:   loadedTy = Ity_I32;  vwiden = Iop_INVALID; break;
6178       case ILGop_16Uto32:   loadedTy = Ity_I16;  vwiden = Iop_16Uto32; break;
6179       case ILGop_16Sto32:   loadedTy = Ity_I16;  vwiden = Iop_16Sto32; break;
6180       case ILGop_8Uto32:    loadedTy = Ity_I8;   vwiden = Iop_8Uto32;  break;
6181       case ILGop_8Sto32:    loadedTy = Ity_I8;   vwiden = Iop_8Sto32;  break;
6182       default: VG_(tool_panic)("do_shadow_LoadG");
6183    }
6184 
6185    IRAtom* vbits_alt
6186       = expr2vbits( mce, lg->alt );
6187    IRAtom* vbits_final
6188       = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
6189                                         lg->addr, 0/*addr bias*/,
6190                                         lg->guard, vwiden, vbits_alt );
6191    /* And finally, bind the V bits to the destination temporary. */
6192    assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
6193 }
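
/* Example (illustrative): for a guarded load whose conversion is
   ILGop_16Uto32, loadedTy is Ity_I16 and vwiden is Iop_16Uto32, so the
   16 loaded V bits are widened with Iop_16Uto32 to give the 32-bit
   shadow of lg->dst when lg->guard is true at run time; when the guard
   is false, the shadow of lg->alt is used instead. */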
6194 
6195 
6196 /*------------------------------------------------------------*/
6197 /*--- Memcheck main                                        ---*/
6198 /*------------------------------------------------------------*/
6199 
6200 static void schemeS ( MCEnv* mce, IRStmt* st );
6201 
6202 static Bool isBogusAtom ( IRAtom* at )
6203 {
6204    ULong n = 0;
6205    IRConst* con;
6206    tl_assert(isIRAtom(at));
6207    if (at->tag == Iex_RdTmp)
6208       return False;
6209    tl_assert(at->tag == Iex_Const);
6210    con = at->Iex.Const.con;
6211    switch (con->tag) {
6212       case Ico_U1:   return False;
6213       case Ico_U8:   n = (ULong)con->Ico.U8; break;
6214       case Ico_U16:  n = (ULong)con->Ico.U16; break;
6215       case Ico_U32:  n = (ULong)con->Ico.U32; break;
6216       case Ico_U64:  n = (ULong)con->Ico.U64; break;
6217       case Ico_F32:  return False;
6218       case Ico_F64:  return False;
6219       case Ico_F32i: return False;
6220       case Ico_F64i: return False;
6221       case Ico_V128: return False;
6222       case Ico_V256: return False;
6223       default: ppIRExpr(at); tl_assert(0);
6224    }
6225    /* VG_(printf)("%llx\n", n); */
6226    return (/*32*/    n == 0xFEFEFEFFULL
6227            /*32*/ || n == 0x80808080ULL
6228            /*32*/ || n == 0x7F7F7F7FULL
6229            /*32*/ || n == 0x7EFEFEFFULL
6230            /*32*/ || n == 0x81010100ULL
6231            /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
6232            /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
6233            /*64*/ || n == 0x0000000000008080ULL
6234            /*64*/ || n == 0x8080808080808080ULL
6235            /*64*/ || n == 0x0101010101010101ULL
6236           );
6237 }
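
/* For context (an illustrative guess at the origin of such values, not
   an exhaustive list): constants like 0x01010101 and 0x80808080 are
   the classic word-at-a-time zero-byte-detection magic numbers, as in

      has_zero = (w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL;

   Inlined string operations built on this trick deliberately load
   partially-defined words, which is why spotting these literals is
   used (see MC_(instrument) below) as a hint to generate the more
   expensive, more precise instrumentation. */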
6238 
6239 static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
6240 {
6241    Int      i;
6242    IRExpr*  e;
6243    IRDirty* d;
6244    IRCAS*   cas;
6245    switch (st->tag) {
6246       case Ist_WrTmp:
6247          e = st->Ist.WrTmp.data;
6248          switch (e->tag) {
6249             case Iex_Get:
6250             case Iex_RdTmp:
6251                return False;
6252             case Iex_Const:
6253                return isBogusAtom(e);
6254             case Iex_Unop:
6255                return isBogusAtom(e->Iex.Unop.arg)
6256                       || e->Iex.Unop.op == Iop_GetMSBs8x16;
6257             case Iex_GetI:
6258                return isBogusAtom(e->Iex.GetI.ix);
6259             case Iex_Binop:
6260                return isBogusAtom(e->Iex.Binop.arg1)
6261                       || isBogusAtom(e->Iex.Binop.arg2);
6262             case Iex_Triop:
6263                return isBogusAtom(e->Iex.Triop.details->arg1)
6264                       || isBogusAtom(e->Iex.Triop.details->arg2)
6265                       || isBogusAtom(e->Iex.Triop.details->arg3);
6266             case Iex_Qop:
6267                return isBogusAtom(e->Iex.Qop.details->arg1)
6268                       || isBogusAtom(e->Iex.Qop.details->arg2)
6269                       || isBogusAtom(e->Iex.Qop.details->arg3)
6270                       || isBogusAtom(e->Iex.Qop.details->arg4);
6271             case Iex_ITE:
6272                return isBogusAtom(e->Iex.ITE.cond)
6273                       || isBogusAtom(e->Iex.ITE.iftrue)
6274                       || isBogusAtom(e->Iex.ITE.iffalse);
6275             case Iex_Load:
6276                return isBogusAtom(e->Iex.Load.addr);
6277             case Iex_CCall:
6278                for (i = 0; e->Iex.CCall.args[i]; i++)
6279                   if (isBogusAtom(e->Iex.CCall.args[i]))
6280                      return True;
6281                return False;
6282             default:
6283                goto unhandled;
6284          }
6285       case Ist_Dirty:
6286          d = st->Ist.Dirty.details;
6287          for (i = 0; d->args[i]; i++) {
6288             IRAtom* atom = d->args[i];
6289             if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
6290                if (isBogusAtom(atom))
6291                   return True;
6292             }
6293          }
6294          if (isBogusAtom(d->guard))
6295             return True;
6296          if (d->mAddr && isBogusAtom(d->mAddr))
6297             return True;
6298          return False;
6299       case Ist_Put:
6300          return isBogusAtom(st->Ist.Put.data);
6301       case Ist_PutI:
6302          return isBogusAtom(st->Ist.PutI.details->ix)
6303                 || isBogusAtom(st->Ist.PutI.details->data);
6304       case Ist_Store:
6305          return isBogusAtom(st->Ist.Store.addr)
6306                 || isBogusAtom(st->Ist.Store.data);
6307       case Ist_StoreG: {
6308          IRStoreG* sg = st->Ist.StoreG.details;
6309          return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
6310                 || isBogusAtom(sg->guard);
6311       }
6312       case Ist_LoadG: {
6313          IRLoadG* lg = st->Ist.LoadG.details;
6314          return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
6315                 || isBogusAtom(lg->guard);
6316       }
6317       case Ist_Exit:
6318          return isBogusAtom(st->Ist.Exit.guard);
6319       case Ist_AbiHint:
6320          return isBogusAtom(st->Ist.AbiHint.base)
6321                 || isBogusAtom(st->Ist.AbiHint.nia);
6322       case Ist_NoOp:
6323       case Ist_IMark:
6324       case Ist_MBE:
6325          return False;
6326       case Ist_CAS:
6327          cas = st->Ist.CAS.details;
6328          return isBogusAtom(cas->addr)
6329                 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
6330                 || isBogusAtom(cas->expdLo)
6331                 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
6332                 || isBogusAtom(cas->dataLo);
6333       case Ist_LLSC:
6334          return isBogusAtom(st->Ist.LLSC.addr)
6335                 || (st->Ist.LLSC.storedata
6336                        ? isBogusAtom(st->Ist.LLSC.storedata)
6337                        : False);
6338       default:
6339       unhandled:
6340          ppIRStmt(st);
6341          VG_(tool_panic)("hasBogusLiterals");
6342    }
6343 }
6344 
6345 
6346 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
6347                         IRSB* sb_in,
6348                         const VexGuestLayout* layout,
6349                         const VexGuestExtents* vge,
6350                         const VexArchInfo* archinfo_host,
6351                         IRType gWordTy, IRType hWordTy )
6352 {
6353    Bool    verboze = 0||False;
6354    Int     i, j, first_stmt;
6355    IRStmt* st;
6356    MCEnv   mce;
6357    IRSB*   sb_out;
6358 
6359    if (gWordTy != hWordTy) {
6360       /* We don't currently support this case. */
6361       VG_(tool_panic)("host/guest word size mismatch");
6362    }
6363 
6364    /* Check we're not completely nuts */
6365    tl_assert(sizeof(UWord)  == sizeof(void*));
6366    tl_assert(sizeof(Word)   == sizeof(void*));
6367    tl_assert(sizeof(Addr)   == sizeof(void*));
6368    tl_assert(sizeof(ULong)  == 8);
6369    tl_assert(sizeof(Long)   == 8);
6370    tl_assert(sizeof(UInt)   == 4);
6371    tl_assert(sizeof(Int)    == 4);
6372 
6373    tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
6374 
6375    /* Set up SB */
6376    sb_out = deepCopyIRSBExceptStmts(sb_in);
6377 
6378    /* Set up the running environment.  Both .sb and .tmpMap are
6379       modified as we go along.  Note that tmps are added to both
6380       .sb->tyenv and .tmpMap together, so the valid index-set for
6381       those two arrays should always be identical. */
6382    VG_(memset)(&mce, 0, sizeof(mce));
6383    mce.sb             = sb_out;
6384    mce.trace          = verboze;
6385    mce.layout         = layout;
6386    mce.hWordTy        = hWordTy;
6387    mce.bogusLiterals  = False;
6388 
6389    /* Do expensive interpretation for Iop_Add32 and Iop_Add64 on
6390       Darwin.  10.7 is mostly built with LLVM, which uses these for
6391       bitfield inserts, and we get a lot of false errors if the cheap
6392       interpretation is used, alas.  Could solve this much better if
6393       we knew which of such adds came from x86/amd64 LEA instructions,
6394       since these are the only ones really needing the expensive
6395       interpretation, but that would require some way to tag them in
6396       the _toIR.c front ends, which is a lot of faffing around.  So
6397       for now just use the slow and blunt-instrument solution. */
6398    mce.useLLVMworkarounds = False;
6399 #  if defined(VGO_darwin)
6400    mce.useLLVMworkarounds = True;
6401 #  endif
6402 
6403    mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
6404                             sizeof(TempMapEnt));
6405    VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
6406    for (i = 0; i < sb_in->tyenv->types_used; i++) {
6407       TempMapEnt ent;
6408       ent.kind    = Orig;
6409       ent.shadowV = IRTemp_INVALID;
6410       ent.shadowB = IRTemp_INVALID;
6411       VG_(addToXA)( mce.tmpMap, &ent );
6412    }
6413    tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
6414 
6415    if (MC_(clo_expensive_definedness_checks)) {
6416       /* For expensive definedness checking skip looking for bogus
6417          literals. */
6418       mce.bogusLiterals = True;
6419    } else {
6420       /* Make a preliminary inspection of the statements, to see if there
6421          are any dodgy-looking literals.  If there are, we generate
6422          extra-detailed (hence extra-expensive) instrumentation in
6423          places.  Scan the whole bb even if dodginess is found earlier,
6424          so that the flatness assertion is applied to all stmts. */
6425       Bool bogus = False;
6426 
6427       for (i = 0; i < sb_in->stmts_used; i++) {
6428          st = sb_in->stmts[i];
6429          tl_assert(st);
6430          tl_assert(isFlatIRStmt(st));
6431 
6432          if (!bogus) {
6433             bogus = checkForBogusLiterals(st);
6434             if (0 && bogus) {
6435                VG_(printf)("bogus: ");
6436                ppIRStmt(st);
6437                VG_(printf)("\n");
6438             }
6439             if (bogus) break;
6440          }
6441       }
6442       mce.bogusLiterals = bogus;
6443    }
6444 
6445    /* Copy verbatim any IR preamble preceding the first IMark */
6446 
6447    tl_assert(mce.sb == sb_out);
6448    tl_assert(mce.sb != sb_in);
6449 
6450    i = 0;
6451    while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
6452 
6453       st = sb_in->stmts[i];
6454       tl_assert(st);
6455       tl_assert(isFlatIRStmt(st));
6456 
6457       stmt( 'C', &mce, sb_in->stmts[i] );
6458       i++;
6459    }
6460 
6461    /* Nasty problem.  IR optimisation of the pre-instrumented IR may
6462       cause the IR following the preamble to contain references to IR
6463       temporaries defined in the preamble.  Because the preamble isn't
6464       instrumented, these temporaries don't have any shadows.
6465       Nevertheless uses of them following the preamble will cause
6466       memcheck to generate references to their shadows.  End effect is
6467       to cause IR sanity check failures, due to references to
6468       non-existent shadows.  This is only evident for the complex
6469       preambles used for function wrapping on TOC-afflicted platforms
6470       (ppc64-linux).
6471 
6472       The following loop therefore scans the preamble looking for
6473       assignments to temporaries.  For each one found it creates an
6474       assignment to the corresponding (V) shadow temp, marking it as
6475       'defined'.  This is the same resulting IR as if the main
6476       instrumentation loop before had been applied to the statement
6477       'tmp = CONSTANT'.
6478 
6479       Similarly, if origin tracking is enabled, we must generate an
6480       assignment for the corresponding origin (B) shadow, claiming
6481       no-origin, as appropriate for a defined value.
6482    */
6483    for (j = 0; j < i; j++) {
6484       if (sb_in->stmts[j]->tag == Ist_WrTmp) {
6485          /* findShadowTmpV checks its arg is an original tmp;
6486             no need to assert that here. */
6487          IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
6488          IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
6489          IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
6490          assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
6491          if (MC_(clo_mc_level) == 3) {
6492             IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
6493             tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
6494             assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
6495          }
6496          if (0) {
6497             VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
6498             ppIRType( ty_v );
6499             VG_(printf)("\n");
6500          }
6501       }
6502    }
6503 
6504    /* Iterate over the remaining stmts to generate instrumentation. */
6505 
6506    tl_assert(sb_in->stmts_used > 0);
6507    tl_assert(i >= 0);
6508    tl_assert(i < sb_in->stmts_used);
6509    tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
6510 
6511    for (/* use current i*/; i < sb_in->stmts_used; i++) {
6512 
6513       st = sb_in->stmts[i];
6514       first_stmt = sb_out->stmts_used;
6515 
6516       if (verboze) {
6517          VG_(printf)("\n");
6518          ppIRStmt(st);
6519          VG_(printf)("\n");
6520       }
6521 
6522       if (MC_(clo_mc_level) == 3) {
6523          /* See comments on case Ist_CAS below. */
6524          if (st->tag != Ist_CAS)
6525             schemeS( &mce, st );
6526       }
6527 
6528       /* Generate instrumentation code for each stmt ... */
6529 
6530       switch (st->tag) {
6531 
6532          case Ist_WrTmp:
6533             assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
6534                                expr2vbits( &mce, st->Ist.WrTmp.data) );
6535             break;
6536 
6537          case Ist_Put:
6538             do_shadow_PUT( &mce,
6539                            st->Ist.Put.offset,
6540                            st->Ist.Put.data,
6541                            NULL /* shadow atom */, NULL /* guard */ );
6542             break;
6543 
6544          case Ist_PutI:
6545             do_shadow_PUTI( &mce, st->Ist.PutI.details);
6546             break;
6547 
6548          case Ist_Store:
6549             do_shadow_Store( &mce, st->Ist.Store.end,
6550                                    st->Ist.Store.addr, 0/* addr bias */,
6551                                    st->Ist.Store.data,
6552                                    NULL /* shadow data */,
6553                                    NULL/*guard*/ );
6554             break;
6555 
6556          case Ist_StoreG:
6557             do_shadow_StoreG( &mce, st->Ist.StoreG.details );
6558             break;
6559 
6560          case Ist_LoadG:
6561             do_shadow_LoadG( &mce, st->Ist.LoadG.details );
6562             break;
6563 
6564          case Ist_Exit:
6565             complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
6566             break;
6567 
6568          case Ist_IMark:
6569             break;
6570 
6571          case Ist_NoOp:
6572          case Ist_MBE:
6573             break;
6574 
6575          case Ist_Dirty:
6576             do_shadow_Dirty( &mce, st->Ist.Dirty.details );
6577             break;
6578 
6579          case Ist_AbiHint:
6580             do_AbiHint( &mce, st->Ist.AbiHint.base,
6581                               st->Ist.AbiHint.len,
6582                               st->Ist.AbiHint.nia );
6583             break;
6584 
6585          case Ist_CAS:
6586             do_shadow_CAS( &mce, st->Ist.CAS.details );
6587             /* Note, do_shadow_CAS copies the CAS itself to the output
6588                block, because it needs to add instrumentation both
6589                before and after it.  Hence skip the copy below.  Also
6590                skip the origin-tracking stuff (call to schemeS) above,
6591                since that's all tangled up with it too; do_shadow_CAS
6592                does it all. */
6593             break;
6594 
6595          case Ist_LLSC:
6596             do_shadow_LLSC( &mce,
6597                             st->Ist.LLSC.end,
6598                             st->Ist.LLSC.result,
6599                             st->Ist.LLSC.addr,
6600                             st->Ist.LLSC.storedata );
6601             break;
6602 
6603          default:
6604             VG_(printf)("\n");
6605             ppIRStmt(st);
6606             VG_(printf)("\n");
6607             VG_(tool_panic)("memcheck: unhandled IRStmt");
6608 
6609       } /* switch (st->tag) */
6610 
6611       if (0 && verboze) {
6612          for (j = first_stmt; j < sb_out->stmts_used; j++) {
6613             VG_(printf)("   ");
6614             ppIRStmt(sb_out->stmts[j]);
6615             VG_(printf)("\n");
6616          }
6617          VG_(printf)("\n");
6618       }
6619 
6620       /* ... and finally copy the stmt itself to the output.  Except,
6621          skip the copy of IRCASs; see comments on case Ist_CAS
6622          above. */
6623       if (st->tag != Ist_CAS)
6624          stmt('C', &mce, st);
6625    }
6626 
6627    /* Now we need to complain if the jump target is undefined. */
6628    first_stmt = sb_out->stmts_used;
6629 
6630    if (verboze) {
6631       VG_(printf)("sb_in->next = ");
6632       ppIRExpr(sb_in->next);
6633       VG_(printf)("\n\n");
6634    }
6635 
6636    complainIfUndefined( &mce, sb_in->next, NULL );
6637 
6638    if (0 && verboze) {
6639       for (j = first_stmt; j < sb_out->stmts_used; j++) {
6640          VG_(printf)("   ");
6641          ppIRStmt(sb_out->stmts[j]);
6642          VG_(printf)("\n");
6643       }
6644       VG_(printf)("\n");
6645    }
6646 
6647    /* If this fails, there's been some serious snafu with tmp management,
6648       that should be investigated. */
6649    tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
6650    VG_(deleteXA)( mce.tmpMap );
6651 
6652    tl_assert(mce.sb == sb_out);
6653    return sb_out;
6654 }
6655 
6656 
6657 /*------------------------------------------------------------*/
6658 /*--- Post-tree-build final tidying                        ---*/
6659 /*------------------------------------------------------------*/
6660 
6661 /* This exploits the observation that Memcheck often produces
6662    repeated conditional calls of the form
6663 
6664    Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
6665 
6666    with the same guard expression G guarding the same helper call.
6667    The second and subsequent calls are redundant.  This usually
6668    results from instrumentation of guest code containing multiple
6669    memory references at different constant offsets from the same base
6670    register.  After optimisation of the instrumentation, you get a
6671    test for the definedness of the base register for each memory
6672    reference, which is kinda pointless.  MC_(final_tidy) therefore
6673    looks for such repeated calls and removes all but the first. */
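
/* Illustrative example of the redundancy (schematic IR):

      if (t_guard) DIRTY MC_(helperc_value_check4_fail_no_o) ( )
      ...
      if (t_guard) DIRTY MC_(helperc_value_check4_fail_no_o) ( )

   Both calls name the same helper and carry guards which sameIRValue
   judges identical, so MC_(final_tidy) turns the second call (and any
   later duplicates) into IRStmt_NoOp. */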
6674 
6675 
6676 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
6677    gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
6678    get almost all the benefits of this transformation whilst causing
6679    the slide-back case to just often enough to be verifiably
6680    the slide-back case to happen just often enough to be verifiably
6681 
6682    bz2-32
6683 
6684    1   4,336 (112,212 -> 1,709,473; ratio 15.2)
6685    2   4,336 (112,194 -> 1,669,895; ratio 14.9)
6686    3   4,336 (112,194 -> 1,660,713; ratio 14.8)
6687    4   4,336 (112,194 -> 1,658,555; ratio 14.8)
6688    5   4,336 (112,194 -> 1,655,447; ratio 14.8)
6689    6   4,336 (112,194 -> 1,655,101; ratio 14.8)
6690    7   4,336 (112,194 -> 1,654,858; ratio 14.7)
6691    8   4,336 (112,194 -> 1,654,810; ratio 14.7)
6692    10  4,336 (112,194 -> 1,654,621; ratio 14.7)
6693    12  4,336 (112,194 -> 1,654,678; ratio 14.7)
6694    16  4,336 (112,194 -> 1,654,494; ratio 14.7)
6695    32  4,336 (112,194 -> 1,654,602; ratio 14.7)
6696    inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
6697 
6698    bz2-64
6699 
6700    1   4,113 (107,329 -> 1,822,171; ratio 17.0)
6701    2   4,113 (107,329 -> 1,806,443; ratio 16.8)
6702    3   4,113 (107,329 -> 1,803,967; ratio 16.8)
6703    4   4,113 (107,329 -> 1,802,785; ratio 16.8)
6704    5   4,113 (107,329 -> 1,802,412; ratio 16.8)
6705    6   4,113 (107,329 -> 1,802,062; ratio 16.8)
6706    7   4,113 (107,329 -> 1,801,976; ratio 16.8)
6707    8   4,113 (107,329 -> 1,801,886; ratio 16.8)
6708    10  4,113 (107,329 -> 1,801,653; ratio 16.8)
6709    12  4,113 (107,329 -> 1,801,526; ratio 16.8)
6710    16  4,113 (107,329 -> 1,801,298; ratio 16.8)
6711    32  4,113 (107,329 -> 1,800,827; ratio 16.8)
6712    inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
6713 */
6714 
6715 /* Structs for recording which (helper, guard) pairs we have already
6716    seen. */
6717 
6718 #define N_TIDYING_PAIRS 16
6719 
6720 typedef
6721    struct { void* entry; IRExpr* guard; }
6722    Pair;
6723 
6724 typedef
6725    struct {
6726       Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
6727       UInt pairsUsed;
6728    }
6729    Pairs;
6730 
6731 
6732 /* Return True if e1 and e2 definitely denote the same value (used to
6733    compare guards).  Return False if unknown; False is the safe
6734    answer.  Since guest registers and guest memory do not have the
6735    SSA property we must return False if any Gets or Loads appear in
6736    the expression.  This implicitly assumes that e1 and e2 have the
6737    same IR type, which is always true here -- the type is Ity_I1. */
6738 
6739 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
6740 {
6741    if (e1->tag != e2->tag)
6742       return False;
6743    switch (e1->tag) {
6744       case Iex_Const:
6745          return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
6746       case Iex_Binop:
6747          return e1->Iex.Binop.op == e2->Iex.Binop.op
6748                 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
6749                 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
6750       case Iex_Unop:
6751          return e1->Iex.Unop.op == e2->Iex.Unop.op
6752                 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
6753       case Iex_RdTmp:
6754          return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
6755       case Iex_ITE:
6756          return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
6757                 && sameIRValue( e1->Iex.ITE.iftrue,  e2->Iex.ITE.iftrue )
6758                 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
6759       case Iex_Qop:
6760       case Iex_Triop:
6761       case Iex_CCall:
6762          /* be lazy.  Could define equality for these, but they never
6763             appear to be used. */
6764          return False;
6765       case Iex_Get:
6766       case Iex_GetI:
6767       case Iex_Load:
6768          /* be conservative - these may not give the same value each
6769             time */
6770          return False;
6771       case Iex_Binder:
6772          /* should never see this */
6773          /* fallthrough */
6774       default:
6775          VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
6776          ppIRExpr(e1);
6777          VG_(tool_panic)("memcheck:sameIRValue");
6778          return False;
6779    }
6780 }
6781 
6782 /* See if 'pairs' already has an entry for (entry, guard).  Return
6783    True if so.  If not, add an entry. */
6784 
6785 static
6786 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
6787 {
6788    UInt i, n = tidyingEnv->pairsUsed;
6789    tl_assert(n <= N_TIDYING_PAIRS);
6790    for (i = 0; i < n; i++) {
6791       if (tidyingEnv->pairs[i].entry == entry
6792           && sameIRValue(tidyingEnv->pairs[i].guard, guard))
6793          return True;
6794    }
6795    /* (guard, entry) wasn't found in the array.  Add it at the end.
6796       If the array is already full, slide the entries one slot
6797       backwards.  This means we will lose the ability to detect
6798       duplicates from the pair in slot zero, but that happens so
6799       rarely that it's unlikely to have much effect on overall code
6800       quality.  Also, this strategy loses the check for the oldest
6801       quality.  Also, this strategy drops the check for the oldest
6802       tracked exit (memory reference, basically), which is (I'd guess)
6803       the one least likely to be re-used after this point. */
6804    if (n == N_TIDYING_PAIRS) {
6805       for (i = 1; i < N_TIDYING_PAIRS; i++) {
6806          tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
6807       }
6808       tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
6809       tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
6810    } else {
6811       tl_assert(n < N_TIDYING_PAIRS);
6812       tidyingEnv->pairs[n].entry = entry;
6813       tidyingEnv->pairs[n].guard = guard;
6814       n++;
6815       tidyingEnv->pairsUsed = n;
6816    }
6817    return False;
6818 }
6819 
6820 static Bool is_helperc_value_checkN_fail ( const HChar* name )
6821 {
6822    /* This is expensive because it happens a lot.  We are checking to
6823       see whether |name| is one of the following 8 strings:
6824 
6825          MC_(helperc_value_check8_fail_no_o)
6826          MC_(helperc_value_check4_fail_no_o)
6827          MC_(helperc_value_check0_fail_no_o)
6828          MC_(helperc_value_check1_fail_no_o)
6829          MC_(helperc_value_check8_fail_w_o)
6830          MC_(helperc_value_check0_fail_w_o)
6831          MC_(helperc_value_check1_fail_w_o)
6832          MC_(helperc_value_check4_fail_w_o)
6833 
6834       To speed it up, check the common prefix just once, rather than
6835       all 8 times.
6836    */
6837    const HChar* prefix = "MC_(helperc_value_check";
6838 
6839    HChar n, p;
6840    while (True) {
6841       n = *name;
6842       p = *prefix;
6843       if (p == 0) break; /* ran off the end of the prefix */
6844       /* We still have some prefix to use */
6845       if (n == 0) return False; /* have prefix, but name ran out */
6846       if (n != p) return False; /* have both pfx and name, but no match */
6847       name++;
6848       prefix++;
6849    }
6850 
6851    /* Check the part after the prefix. */
6852    tl_assert(*prefix == 0 && *name != 0);
6853    return    0==VG_(strcmp)(name, "8_fail_no_o)")
6854           || 0==VG_(strcmp)(name, "4_fail_no_o)")
6855           || 0==VG_(strcmp)(name, "0_fail_no_o)")
6856           || 0==VG_(strcmp)(name, "1_fail_no_o)")
6857           || 0==VG_(strcmp)(name, "8_fail_w_o)")
6858           || 0==VG_(strcmp)(name, "4_fail_w_o)")
6859           || 0==VG_(strcmp)(name, "0_fail_w_o)")
6860           || 0==VG_(strcmp)(name, "1_fail_w_o)");
6861 }
6862 
6863 IRSB* MC_(final_tidy) ( IRSB* sb_in )
6864 {
6865    Int       i;
6866    IRStmt*   st;
6867    IRDirty*  di;
6868    IRExpr*   guard;
6869    IRCallee* cee;
6870    Bool      alreadyPresent;
6871    Pairs     pairs;
6872 
6873    pairs.pairsUsed = 0;
6874 
6875    pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
6876    pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
6877 
6878    /* Scan forwards through the statements.  Each time a call to one
6879       of the relevant helpers is seen, check if we have made a
6880       previous call to the same helper using the same guard
6881       expression, and if so, delete the call. */
6882    for (i = 0; i < sb_in->stmts_used; i++) {
6883       st = sb_in->stmts[i];
6884       tl_assert(st);
6885       if (st->tag != Ist_Dirty)
6886          continue;
6887       di = st->Ist.Dirty.details;
6888       guard = di->guard;
6889       tl_assert(guard);
6890       if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
6891       cee = di->cee;
6892       if (!is_helperc_value_checkN_fail( cee->name ))
6893          continue;
6894        /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
6895           guard 'guard'.  Check if we have already seen a call to this
6896           function with the same guard.  If so, delete it.  If not,
6897           add it to the set of calls we do know about. */
6898       alreadyPresent = check_or_add( &pairs, guard, cee->addr );
6899       if (alreadyPresent) {
6900          sb_in->stmts[i] = IRStmt_NoOp();
6901          if (0) VG_(printf)("XX\n");
6902       }
6903    }
6904 
6905    tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
6906    tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
6907 
6908    return sb_in;
6909 }
6910 
6911 #undef N_TIDYING_PAIRS
6912 
6913 
6914 /*------------------------------------------------------------*/
6915 /*--- Origin tracking stuff                                ---*/
6916 /*------------------------------------------------------------*/
6917 
6918 /* Almost identical to findShadowTmpV. */
6919 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
6920 {
6921    TempMapEnt* ent;
6922    /* VG_(indexXA) range-checks 'orig', hence no need to check
6923       here. */
6924    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6925    tl_assert(ent->kind == Orig);
6926    if (ent->shadowB == IRTemp_INVALID) {
6927       IRTemp tmpB
6928         = newTemp( mce, Ity_I32, BSh );
6929       /* newTemp may cause mce->tmpMap to resize, hence previous results
6930          from VG_(indexXA) are invalid. */
6931       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6932       tl_assert(ent->kind == Orig);
6933       tl_assert(ent->shadowB == IRTemp_INVALID);
6934       ent->shadowB = tmpB;
6935    }
6936    return ent->shadowB;
6937 }
6938 
6939 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
6940 {
6941    return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
6942 }
6943 
6944 
6945 /* Make a guarded origin load, with no special handling in the
6946    didn't-happen case.  A GUARD of NULL is assumed to mean "always
6947    True".
6948 
6949    Generate IR to do a shadow origins load from BASEADDR+OFFSET and
6950    return the otag.  The loaded size is SZB.  If GUARD evaluates to
6951    False at run time then the returned otag is zero.
6952 */
6953 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
6954                                     IRAtom* baseaddr,
6955                                     Int offset, IRExpr* guard )
6956 {
6957    void*    hFun;
6958    const HChar* hName;
6959    IRTemp   bTmp;
6960    IRDirty* di;
6961    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
6962    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
6963    IRAtom*  ea    = baseaddr;
6964    if (offset != 0) {
6965       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
6966                                    : mkU64( (Long)(Int)offset );
6967       ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
6968    }
6969    bTmp = newTemp(mce, mce->hWordTy, BSh);
6970 
6971    switch (szB) {
6972       case 1: hFun  = (void*)&MC_(helperc_b_load1);
6973               hName = "MC_(helperc_b_load1)";
6974               break;
6975       case 2: hFun  = (void*)&MC_(helperc_b_load2);
6976               hName = "MC_(helperc_b_load2)";
6977               break;
6978       case 4: hFun  = (void*)&MC_(helperc_b_load4);
6979               hName = "MC_(helperc_b_load4)";
6980               break;
6981       case 8: hFun  = (void*)&MC_(helperc_b_load8);
6982               hName = "MC_(helperc_b_load8)";
6983               break;
6984       case 16: hFun  = (void*)&MC_(helperc_b_load16);
6985                hName = "MC_(helperc_b_load16)";
6986                break;
6987       case 32: hFun  = (void*)&MC_(helperc_b_load32);
6988                hName = "MC_(helperc_b_load32)";
6989                break;
6990       default:
6991          VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
6992          tl_assert(0);
6993    }
6994    di = unsafeIRDirty_1_N(
6995            bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
6996            mkIRExprVec_1( ea )
6997         );
6998    if (guard) {
6999       di->guard = guard;
7000       /* Ideally the didn't-happen return value here would be
7001          all-zeroes (unknown-origin), so it'd be harmless if it got
7002          used inadvertently.  We slum it out with the IR-mandated
7003          default value (0b01 repeating, 0x55 etc) as that'll probably
7004          trump all legitimate otags via Max32, and it's pretty
7005          obviously bogus. */
7006    }
7007    /* no need to mess with any annotations.  This call accesses
7008       neither guest state nor guest memory. */
7009    stmt( 'B', mce, IRStmt_Dirty(di) );
7010    if (mce->hWordTy == Ity_I64) {
7011       /* 64-bit host */
7012       IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
7013       assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
7014       return mkexpr(bTmp32);
7015    } else {
7016       /* 32-bit host */
7017       return mkexpr(bTmp);
7018    }
7019 }
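
/* Example use (illustrative): an origin load of a 4-byte location at
   baseaddr+8, guarded by g, comes out roughly as

      ea  = Add64(baseaddr, 8:I64)            -- Iop_Add32 on 32-bit hosts
      if (g) t64 = DIRTY MC_(helperc_b_load4) ( ea )
      t32 = 64to32(t64)                       -- 64-bit hosts only

   and t32 is returned as the otag. */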
7020 
7021 
7022 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET.  The
7023    loaded size is SZB.  The load is regarded as unconditional (always
7024    happens).
7025 */
7026 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
7027                             Int offset )
7028 {
7029    return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
7030 }
7031 
7032 
7033 /* The most general handler for guarded origin loads.  A GUARD of NULL
7034    is assumed to mean "always True".
7035 
7036    Generate IR to do a shadow origin load from ADDR+BIAS and return
7037    the B bits.  The loaded type is TY.  If GUARD evaluates to False at
7038    run time then the returned B bits are simply BALT instead.
7039 */
7040 static
7041 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
7042                                         IRType ty,
7043                                         IRAtom* addr, UInt bias,
7044                                         IRAtom* guard, IRAtom* balt )
7045 {
7046    /* If the guard evaluates to True, this will hold the loaded
7047       origin.  If the guard evaluates to False, it merely holds the
7048       dirty call's didn't-happen default value (not a meaningful
7049       origin), so we replace it below using an ITE. */
7050    IRAtom* iftrue
7051       = assignNew('B', mce, Ity_I32,
7052                   gen_guarded_load_b(mce, sizeofIRType(ty),
7053                                      addr, bias, guard));
7054    /* These are the bits we will return if the load doesn't take
7055       place. */
7056    IRAtom* iffalse
7057       = balt;
7058    /* Prepare the cond for the ITE.  Convert a NULL cond into
7059       something that iropt knows how to fold out later. */
7060    IRAtom* cond
7061       = guard == NULL  ? mkU1(1)  : guard;
7062    /* And assemble the final result. */
7063    return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
7064 }
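
/* In the unconditional case (guard == NULL) the condition becomes the
   constant 1:I1, so the ITE above is just

      res = ITE(1:I1, iftrue, iffalse)

   which iropt later folds down to 'iftrue', as intended. */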
7065 
7066 
7067 /* Generate a shadow origins store.  guard :: Ity_I1 controls whether
7068    the store really happens; NULL means it unconditionally does. */
7069 static void gen_store_b ( MCEnv* mce, Int szB,
7070                           IRAtom* baseaddr, Int offset, IRAtom* dataB,
7071                           IRAtom* guard )
7072 {
7073    void*    hFun;
7074    const HChar* hName;
7075    IRDirty* di;
7076    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7077    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7078    IRAtom*  ea    = baseaddr;
7079    if (guard) {
7080       tl_assert(isOriginalAtom(mce, guard));
7081       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
7082    }
7083    if (offset != 0) {
7084       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7085                                    : mkU64( (Long)(Int)offset );
7086       ea = assignNew(  'B', mce, aTy, binop(opAdd, ea, off));
7087    }
7088    if (mce->hWordTy == Ity_I64)
7089       dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
7090 
7091    switch (szB) {
7092       case 1: hFun  = (void*)&MC_(helperc_b_store1);
7093               hName = "MC_(helperc_b_store1)";
7094               break;
7095       case 2: hFun  = (void*)&MC_(helperc_b_store2);
7096               hName = "MC_(helperc_b_store2)";
7097               break;
7098       case 4: hFun  = (void*)&MC_(helperc_b_store4);
7099               hName = "MC_(helperc_b_store4)";
7100               break;
7101       case 8: hFun  = (void*)&MC_(helperc_b_store8);
7102               hName = "MC_(helperc_b_store8)";
7103               break;
7104       case 16: hFun  = (void*)&MC_(helperc_b_store16);
7105                hName = "MC_(helperc_b_store16)";
7106                break;
7107       case 32: hFun  = (void*)&MC_(helperc_b_store32);
7108                hName = "MC_(helperc_b_store32)";
7109                break;
7110       default:
7111          tl_assert(0);
7112    }
7113    di = unsafeIRDirty_0_N( 2/*regparms*/,
7114            hName, VG_(fnptr_to_fnentry)( hFun ),
7115            mkIRExprVec_2( ea, dataB )
7116         );
7117    /* no need to mess with any annotations.  This call accesses
7118       neither guest state nor guest memory. */
7119    if (guard) di->guard = guard;
7120    stmt( 'B', mce, IRStmt_Dirty(di) );
7121 }
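
/* For example, gen_store_b(mce, 8, a, 0, dB, g) on a 64-bit host emits
   roughly (names illustrative):

      dB64 = 32Uto64(dB)                  -- otag widened to host word
      DIRTY g ::: MC_(helperc_b_store8)(a, dB64)

   that is, the 32-bit origin tag is zero-widened and passed, together
   with the effective address, to the store helper; the guard, if
   present, makes the helper call conditional. */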
7122 
7123 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
7124    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7125    if (eTy == Ity_I64)
7126       return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
7127    if (eTy == Ity_I32)
7128       return e;
7129    tl_assert(0);
7130 }
7131 
7132 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
7133    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7134    tl_assert(eTy == Ity_I32);
7135    if (dstTy == Ity_I64)
7136       return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
7137    tl_assert(0);
7138 }
7139 
7140 
7141 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
7142 {
7143    tl_assert(MC_(clo_mc_level) == 3);
7144 
7145    switch (e->tag) {
7146 
7147       case Iex_GetI: {
7148          IRRegArray* descr_b;
7149          IRAtom      *t1, *t2, *t3, *t4;
7150          IRRegArray* descr      = e->Iex.GetI.descr;
7151          IRType equivIntTy
7152             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7153          /* If this array is unshadowable for whatever reason, use the
7154             usual approximation. */
7155          if (equivIntTy == Ity_INVALID)
7156             return mkU32(0);
7157          tl_assert(sizeofIRType(equivIntTy) >= 4);
7158          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7159          descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7160                                  equivIntTy, descr->nElems );
7161          /* Do a shadow indexed get of the same size, giving t1.  Take
7162             the bottom 32 bits of it, giving t2.  Compute into t3 the
7163             origin for the index (almost certainly zero, but there's
7164             no harm in being completely general here, since iropt will
7165             remove any useless code), and fold it in, giving a final
7166             value t4. */
7167          t1 = assignNew( 'B', mce, equivIntTy,
7168                           IRExpr_GetI( descr_b, e->Iex.GetI.ix,
7169                                                 e->Iex.GetI.bias ));
7170          t2 = narrowTo32( mce, t1 );
7171          t3 = schemeE( mce, e->Iex.GetI.ix );
7172          t4 = gen_maxU32( mce, t2, t3 );
7173          return t4;
7174       }
7175       case Iex_CCall: {
7176          Int i;
7177          IRAtom*  here;
7178          IRExpr** args = e->Iex.CCall.args;
7179          IRAtom*  curr = mkU32(0);
7180          for (i = 0; args[i]; i++) {
7181             tl_assert(i < 32);
7182             tl_assert(isOriginalAtom(mce, args[i]));
7183             /* Only take notice of this arg if the callee's
7184                mc-exclusion mask does not say it is to be excluded. */
7185             if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
7186                /* the arg is to be excluded from definedness checking.
7187                   Do nothing. */
7188                if (0) VG_(printf)("excluding %s(%d)\n",
7189                                   e->Iex.CCall.cee->name, i);
7190             } else {
7191                /* calculate the arg's definedness, and pessimistically
7192                   merge it in. */
7193                here = schemeE( mce, args[i] );
7194                curr = gen_maxU32( mce, curr, here );
7195             }
7196          }
7197          return curr;
7198       }
7199       case Iex_Load: {
7200          Int dszB;
7201          dszB = sizeofIRType(e->Iex.Load.ty);
7202          /* assert that the B value for the address is already
7203             available (somewhere) */
7204          tl_assert(isIRAtom(e->Iex.Load.addr));
7205          tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
7206          return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
7207       }
7208       case Iex_ITE: {
7209          IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
7210          IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
7211          IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
7212          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
7213       }
7214       case Iex_Qop: {
7215          IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
7216          IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
7217          IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
7218          IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
7219          return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
7220                                  gen_maxU32( mce, b3, b4 ) );
7221       }
7222       case Iex_Triop: {
7223          IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
7224          IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
7225          IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
7226          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
7227       }
7228       case Iex_Binop: {
7229          switch (e->Iex.Binop.op) {
7230             case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
7231             case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
7232             case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
7233             case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
7234                /* Just say these all produce a defined result,
7235                   regardless of their arguments.  See
7236                   COMMENT_ON_CasCmpEQ in this file. */
7237                return mkU32(0);
7238             default: {
7239                IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
7240                IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
7241                return gen_maxU32( mce, b1, b2 );
7242             }
7243          }
7244          tl_assert(0);
7245          /*NOTREACHED*/
7246       }
7247       case Iex_Unop: {
7248          IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
7249          return b1;
7250       }
7251       case Iex_Const:
7252          return mkU32(0);
7253       case Iex_RdTmp:
7254          return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
7255       case Iex_Get: {
7256          Int b_offset = MC_(get_otrack_shadow_offset)(
7257                            e->Iex.Get.offset,
7258                            sizeofIRType(e->Iex.Get.ty)
7259                         );
7260          tl_assert(b_offset >= -1
7261                    && b_offset <= mce->layout->total_sizeB -4);
7262          if (b_offset >= 0) {
7263             /* FIXME: this isn't an atom! */
7264             return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
7265                                Ity_I32 );
7266          }
7267          return mkU32(0);
7268       }
7269       default:
7270          VG_(printf)("mc_translate.c: schemeE: unhandled: ");
7271          ppIRExpr(e);
7272          VG_(tool_panic)("memcheck:schemeE");
7273    }
7274 }
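
/* A small worked example of schemeE: for the original expression
   Add32(t7, t9) it returns (roughly)

      gen_maxU32( b_t7, b_t9 )

   where b_t7 and b_t9 are the B (origin) shadows of t7 and t9; the two
   argument origins are pessimistically merged by taking their unsigned
   32-bit maximum, the same Max32 scheme used throughout this file. */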
7275 
7276 
7277 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
7278 {
7279    // This is a hacked version of do_shadow_Dirty
7280    Int       i, k, n, toDo, gSz, gOff;
7281    IRAtom    *here, *curr;
7282    IRTemp    dst;
7283 
7284    /* First check the guard. */
7285    curr = schemeE( mce, d->guard );
7286 
7287    /* Now round up all inputs and maxU32 over them. */
7288 
7289    /* Inputs: unmasked args
7290       Note: arguments are evaluated REGARDLESS of the guard expression */
7291    for (i = 0; d->args[i]; i++) {
7292       IRAtom* arg = d->args[i];
7293       if ( (d->cee->mcx_mask & (1<<i))
7294            || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
7295          /* ignore this arg */
7296       } else {
7297          here = schemeE( mce, arg );
7298          curr = gen_maxU32( mce, curr, here );
7299       }
7300    }
7301 
7302    /* Inputs: guest state that we read. */
7303    for (i = 0; i < d->nFxState; i++) {
7304       tl_assert(d->fxState[i].fx != Ifx_None);
7305       if (d->fxState[i].fx == Ifx_Write)
7306          continue;
7307 
7308       /* Enumerate the described state segments */
7309       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7310          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7311          gSz  = d->fxState[i].size;
7312 
7313          /* Ignore any sections marked as 'always defined'. */
7314          if (isAlwaysDefd(mce, gOff, gSz)) {
7315             if (0)
7316             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7317                         gOff, gSz);
7318             continue;
7319          }
7320 
7321          /* This state element is read or modified.  So we need to
7322             consider it.  If larger than 4 bytes, deal with it in
7323             4-byte chunks. */
7324          while (True) {
7325             Int b_offset;
7326             tl_assert(gSz >= 0);
7327             if (gSz == 0) break;
7328             n = gSz <= 4 ? gSz : 4;
7329             /* update 'curr' with maxU32 of the state slice
7330                gOff .. gOff+n-1 */
7331             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7332             if (b_offset != -1) {
7333                /* Observe the guard expression. If it is false use 0, i.e.
7334                   nothing is known about the origin */
7335                IRAtom *cond, *iffalse, *iftrue;
7336 
7337                cond = assignNew( 'B', mce, Ity_I1, d->guard);
7338                iffalse = mkU32(0);
7339                iftrue  = assignNew( 'B', mce, Ity_I32,
7340                                     IRExpr_Get(b_offset
7341                                                  + 2*mce->layout->total_sizeB,
7342                                                Ity_I32));
7343                here = assignNew( 'B', mce, Ity_I32,
7344                                  IRExpr_ITE(cond, iftrue, iffalse));
7345                curr = gen_maxU32( mce, curr, here );
7346             }
7347             gSz -= n;
7348             gOff += n;
7349          }
7350       }
7351    }
7352 
7353    /* Inputs: memory */
7354 
7355    if (d->mFx != Ifx_None) {
7356       /* Because we may do multiple shadow loads/stores from the same
7357          base address, it's best to do a single test of its
7358          definedness right now.  Post-instrumentation optimisation
7359          should remove all but this test. */
7360       tl_assert(d->mAddr);
7361       here = schemeE( mce, d->mAddr );
7362       curr = gen_maxU32( mce, curr, here );
7363    }
7364 
7365    /* Deal with memory inputs (reads or modifies) */
7366    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
7367       toDo   = d->mSize;
7368       /* chew off 32-bit chunks.  We don't care about the endianness
7369          since it's all going to be condensed down to a single bit,
7370          but nevertheless choose an endianness which is hopefully
7371          native to the platform. */
7372       while (toDo >= 4) {
7373          here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
7374                                     d->guard );
7375          curr = gen_maxU32( mce, curr, here );
7376          toDo -= 4;
7377       }
7378       /* handle possible 16-bit excess */
7379       while (toDo >= 2) {
7380          here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
7381                                     d->guard );
7382          curr = gen_maxU32( mce, curr, here );
7383          toDo -= 2;
7384       }
7385       /* chew off the remaining 8-bit chunk, if any */
7386       if (toDo == 1) {
7387          here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
7388                                     d->guard );
7389          curr = gen_maxU32( mce, curr, here );
7390          toDo -= 1;
7391       }
7392       tl_assert(toDo == 0);
7393    }
7394 
7395    /* Whew!  So curr is a 32-bit B-value which should give an origin
7396       of some use if any of the inputs to the helper are undefined.
7397       Now we need to re-distribute the results to all destinations. */
7398 
7399    /* Outputs: the destination temporary, if there is one. */
7400    if (d->tmp != IRTemp_INVALID) {
7401       dst   = findShadowTmpB(mce, d->tmp);
7402       assign( 'V', mce, dst, curr );
7403    }
7404 
7405    /* Outputs: guest state that we write or modify. */
7406    for (i = 0; i < d->nFxState; i++) {
7407       tl_assert(d->fxState[i].fx != Ifx_None);
7408       if (d->fxState[i].fx == Ifx_Read)
7409          continue;
7410 
7411       /* Enumerate the described state segments */
7412       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7413          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7414          gSz  = d->fxState[i].size;
7415 
7416          /* Ignore any sections marked as 'always defined'. */
7417          if (isAlwaysDefd(mce, gOff, gSz))
7418             continue;
7419 
7420          /* This state element is written or modified.  So we need to
7421             consider it.  If larger than 4 bytes, deal with it in
7422             4-byte chunks. */
7423          while (True) {
7424             Int b_offset;
7425             tl_assert(gSz >= 0);
7426             if (gSz == 0) break;
7427             n = gSz <= 4 ? gSz : 4;
7428             /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7429             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7430             if (b_offset != -1) {
7431 
7432                /* If the guard expression evaluates to false we simply Put
7433                   the value that is already stored in the guest state slot */
7434                IRAtom *cond, *iffalse;
7435 
7436                cond    = assignNew('B', mce, Ity_I1,
7437                                    d->guard);
7438                iffalse = assignNew('B', mce, Ity_I32,
7439                                    IRExpr_Get(b_offset +
7440                                               2*mce->layout->total_sizeB,
7441                                               Ity_I32));
7442                curr = assignNew('V', mce, Ity_I32,
7443                                 IRExpr_ITE(cond, curr, iffalse));
7444 
7445                stmt( 'B', mce, IRStmt_Put(b_offset
7446                                           + 2*mce->layout->total_sizeB,
7447                                           curr ));
7448             }
7449             gSz -= n;
7450             gOff += n;
7451          }
7452       }
7453    }
7454 
7455    /* Outputs: memory that we write or modify.  Same comments about
7456       endianness as above apply. */
7457    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7458       toDo   = d->mSize;
7459       /* chew off 32-bit chunks */
7460       while (toDo >= 4) {
7461          gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7462                       d->guard );
7463          toDo -= 4;
7464       }
7465       /* handle possible 16-bit excess */
7466       while (toDo >= 2) {
7467          gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7468                       d->guard );
7469          toDo -= 2;
7470       }
7471       /* chew off the remaining 8-bit chunk, if any */
7472       if (toDo == 1) {
7473          gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7474                       d->guard );
7475          toDo -= 1;
7476       }
7477       tl_assert(toDo == 0);
7478    }
7479 }
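
/* Worked example of the 4/2/1-byte chunking used above: for a dirty
   call that reads 7 bytes of memory (d->mSize == 7), the read loop
   issues

      gen_guarded_load_b(mce, 4, d->mAddr, 0, d->guard)   -- bytes 0..3
      gen_guarded_load_b(mce, 2, d->mAddr, 4, d->guard)   -- bytes 4..5
      gen_guarded_load_b(mce, 1, d->mAddr, 6, d->guard)   -- byte  6

   and maxU32s the three results into 'curr'.  The write-side loop at
   the end of the function splits the written range the same way. */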
7480 
7481 
7482 /* Generate IR for origin shadowing for a general guarded store. */
7483 static void do_origins_Store_guarded ( MCEnv* mce,
7484                                        IREndness stEnd,
7485                                        IRExpr* stAddr,
7486                                        IRExpr* stData,
7487                                        IRExpr* guard )
7488 {
7489    Int     dszB;
7490    IRAtom* dataB;
7491    /* assert that the B value for the address is already available
7492       (somewhere), since the call to schemeE will want to see it.
7493       XXXX how does this actually ensure that?? */
7494    tl_assert(isIRAtom(stAddr));
7495    tl_assert(isIRAtom(stData));
7496    dszB  = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7497    dataB = schemeE( mce, stData );
7498    gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7499 }
7500 
7501 
7502 /* Generate IR for origin shadowing for a plain store. */
7503 static void do_origins_Store_plain ( MCEnv* mce,
7504                                      IREndness stEnd,
7505                                      IRExpr* stAddr,
7506                                      IRExpr* stData )
7507 {
7508    do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7509                               NULL/*guard*/ );
7510 }
7511 
7512 
7513 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7514 
7515 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7516 {
7517    do_origins_Store_guarded( mce, sg->end, sg->addr,
7518                              sg->data, sg->guard );
7519 }
7520 
7521 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7522 {
7523    IRType loadedTy = Ity_INVALID;
7524    switch (lg->cvt) {
7525       case ILGop_IdentV128: loadedTy = Ity_V128; break;
7526       case ILGop_Ident64:   loadedTy = Ity_I64;  break;
7527       case ILGop_Ident32:   loadedTy = Ity_I32;  break;
7528       case ILGop_16Uto32:   loadedTy = Ity_I16;  break;
7529       case ILGop_16Sto32:   loadedTy = Ity_I16;  break;
7530       case ILGop_8Uto32:    loadedTy = Ity_I8;   break;
7531       case ILGop_8Sto32:    loadedTy = Ity_I8;   break;
7532       default: VG_(tool_panic)("schemeS.IRLoadG");
7533    }
7534    IRAtom* ori_alt
7535       = schemeE( mce, lg->alt );
7536    IRAtom* ori_final
7537       = expr2ori_Load_guarded_General(mce, loadedTy,
7538                                       lg->addr, 0/*addr bias*/,
7539                                       lg->guard, ori_alt );
7540    /* And finally, bind the origin to the destination temporary. */
7541    assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7542 }
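
/* For example, for a guarded widening load

      t = if (g) 16Uto32(LDle:I16(a)) else alt

   the origin side becomes, roughly,

      b_t = ITE(g, MC_(helperc_b_load2)(a), b_alt)

   i.e. a guarded 2-byte B load from lg->addr, falling back to the
   origin of lg->alt when the guard is false at run time. */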
7543 
7544 
7545 static void schemeS ( MCEnv* mce, IRStmt* st )
7546 {
7547    tl_assert(MC_(clo_mc_level) == 3);
7548 
7549    switch (st->tag) {
7550 
7551       case Ist_AbiHint:
7552          /* The value-check instrumenter handles this - by arranging
7553             to pass the address of the next instruction to
7554             MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
7555             happen for origin tracking w.r.t. AbiHints.  So there is
7556             nothing to do here. */
7557          break;
7558 
7559       case Ist_PutI: {
7560          IRPutI *puti = st->Ist.PutI.details;
7561          IRRegArray* descr_b;
7562          IRAtom      *t1, *t2, *t3, *t4;
7563          IRRegArray* descr = puti->descr;
7564          IRType equivIntTy
7565             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7566          /* If this array is unshadowable for whatever reason,
7567             generate no code. */
7568          if (equivIntTy == Ity_INVALID)
7569             break;
7570          tl_assert(sizeofIRType(equivIntTy) >= 4);
7571          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7572          descr_b
7573             = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7574                             equivIntTy, descr->nElems );
7575          /* Compute a value to Put - the conjoinment of the origin for
7576             the data to be Put-ted (obviously) and of the index value
7577             (not so obviously). */
7578          t1 = schemeE( mce, puti->data );
7579          t2 = schemeE( mce, puti->ix );
7580          t3 = gen_maxU32( mce, t1, t2 );
7581          t4 = zWidenFrom32( mce, equivIntTy, t3 );
7582          stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7583                                                puti->bias, t4) ));
7584          break;
7585       }
7586 
7587       case Ist_Dirty:
7588          do_origins_Dirty( mce, st->Ist.Dirty.details );
7589          break;
7590 
7591       case Ist_Store:
7592          do_origins_Store_plain( mce, st->Ist.Store.end,
7593                                       st->Ist.Store.addr,
7594                                       st->Ist.Store.data );
7595          break;
7596 
7597       case Ist_StoreG:
7598          do_origins_StoreG( mce, st->Ist.StoreG.details );
7599          break;
7600 
7601       case Ist_LoadG:
7602          do_origins_LoadG( mce, st->Ist.LoadG.details );
7603          break;
7604 
7605       case Ist_LLSC: {
7606          /* In short: treat a load-linked like a normal load followed
7607             by an assignment of the loaded (shadow) data the result
7608             temporary.  Treat a store-conditional like a normal store,
7609             and mark the result temporary as defined. */
7610          if (st->Ist.LLSC.storedata == NULL) {
7611             /* Load Linked */
7612             IRType resTy
7613                = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7614             IRExpr* vanillaLoad
7615                = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7616             tl_assert(resTy == Ity_I64 || resTy == Ity_I32
7617                       || resTy == Ity_I16 || resTy == Ity_I8);
7618             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7619                               schemeE(mce, vanillaLoad));
7620          } else {
7621             /* Store conditional */
7622             do_origins_Store_plain( mce, st->Ist.LLSC.end,
7623                                     st->Ist.LLSC.addr,
7624                                     st->Ist.LLSC.storedata );
7625             /* For the rationale behind this, see comments at the
7626                place where the V-shadow for .result is constructed, in
7627                do_shadow_LLSC.  In short, we regard .result as
7628                always-defined. */
7629             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7630                               mkU32(0) );
7631          }
7632          break;
7633       }
7634 
7635       case Ist_Put: {
7636          Int b_offset
7637             = MC_(get_otrack_shadow_offset)(
7638                  st->Ist.Put.offset,
7639                  sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7640               );
7641          if (b_offset >= 0) {
7642             /* FIXME: this isn't an atom! */
7643             stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7644                                        schemeE( mce, st->Ist.Put.data )) );
7645          }
7646          break;
7647       }
7648 
7649       case Ist_WrTmp:
7650          assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7651                            schemeE(mce, st->Ist.WrTmp.data) );
7652          break;
7653 
7654       case Ist_MBE:
7655       case Ist_NoOp:
7656       case Ist_Exit:
7657       case Ist_IMark:
7658          break;
7659 
7660       default:
7661          VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7662          ppIRStmt(st);
7663          VG_(tool_panic)("memcheck:schemeS");
7664    }
7665 }
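
/* For example, for an original statement

      PUT(168) = t5                          (Ist_Put)

   schemeS asks MC_(get_otrack_shadow_offset) for the origin-shadow
   offset corresponding to guest offset 168 and, if that slot is
   trackable, emits

      PUT(b_offset + 2*total_sizeB) = <B shadow of t5>

   the 2*total_sizeB displacement selecting the area of the shadow
   guest state that this scheme uses for origin tags.  (168 is just an
   illustrative offset.) */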
7666 
7667 
7668 /*------------------------------------------------------------*/
7669 /*--- Startup assertion checking                           ---*/
7670 /*------------------------------------------------------------*/
7671 
7672 void MC_(do_instrumentation_startup_checks)( void )
7673 {
7674    /* Make a best-effort check to see that is_helperc_value_checkN_fail
7675       is working as we expect. */
7676 
7677 #  define CHECK(_expected, _string) \
7678       tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
7679 
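   /* For instance, the first CHECK below expands to

         tl_assert((True) == is_helperc_value_checkN_fail(
                                "MC_(helperc_value_check8_fail_no_o)"));

      and will abort startup if the classifier misidentifies that
      name. */
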
7680    /* It should identify these 8, and no others, as targets. */
7681    CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
7682    CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
7683    CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
7684    CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
7685    CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
7686    CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
7687    CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
7688    CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
7689 
7690    /* Ad-hoc selection of other strings gathered via a quick test. */
7691    CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
7692    CHECK(False, "amd64g_dirtyhelper_RDTSC");
7693    CHECK(False, "MC_(helperc_b_load1)");
7694    CHECK(False, "MC_(helperc_b_load2)");
7695    CHECK(False, "MC_(helperc_b_load4)");
7696    CHECK(False, "MC_(helperc_b_load8)");
7697    CHECK(False, "MC_(helperc_b_load16)");
7698    CHECK(False, "MC_(helperc_b_load32)");
7699    CHECK(False, "MC_(helperc_b_store1)");
7700    CHECK(False, "MC_(helperc_b_store2)");
7701    CHECK(False, "MC_(helperc_b_store4)");
7702    CHECK(False, "MC_(helperc_b_store8)");
7703    CHECK(False, "MC_(helperc_b_store16)");
7704    CHECK(False, "MC_(helperc_b_store32)");
7705    CHECK(False, "MC_(helperc_LOADV8)");
7706    CHECK(False, "MC_(helperc_LOADV16le)");
7707    CHECK(False, "MC_(helperc_LOADV32le)");
7708    CHECK(False, "MC_(helperc_LOADV64le)");
7709    CHECK(False, "MC_(helperc_LOADV128le)");
7710    CHECK(False, "MC_(helperc_LOADV256le)");
7711    CHECK(False, "MC_(helperc_STOREV16le)");
7712    CHECK(False, "MC_(helperc_STOREV32le)");
7713    CHECK(False, "MC_(helperc_STOREV64le)");
7714    CHECK(False, "MC_(helperc_STOREV8)");
7715    CHECK(False, "track_die_mem_stack_8");
7716    CHECK(False, "track_new_mem_stack_8_w_ECU");
7717    CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
7718    CHECK(False, "VG_(unknown_SP_update_w_ECU)");
7719 
7720 #  undef CHECK
7721 }
7722 
7723 
7724 /*--------------------------------------------------------------------*/
7725 /*--- end                                           mc_translate.c ---*/
7726 /*--------------------------------------------------------------------*/
7727