//===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the TargetLoweringX86Base class, which consists almost
/// entirely of the lowering sequence for each high-level instruction.
///
//===----------------------------------------------------------------------===//

#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
#define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H

#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstVarIter.h"
#include "IceInstX86Base.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IcePhiLoweringImpl.h"
#include "IceUtils.h"
#include "IceVariableSplitting.h"

#include "llvm/Support/MathExtras.h"

#include <stack>

namespace Ice {
namespace X86 {
template <typename T> struct PoolTypeConverter {};

template <> struct PoolTypeConverter<float> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantFloat;
  static const Type Ty = IceType_f32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

template <> struct PoolTypeConverter<double> {
  using PrimitiveIntType = uint64_t;
  using IceType = ConstantDouble;
  static const Type Ty = IceType_f64;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint32_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint16_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i16;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint8_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i8;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};
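
// Illustrative sketch only (not part of the build): a constant-pool emitter
// could consume these traits roughly as follows, where "Emitter" and
// "emitPoolEntry" are hypothetical names used purely for exposition:
//
//   template <typename T>
//   void emitPoolEntry(Emitter &Out, const typename T::IceType *Const) {
//     const auto Value = Const->getValue();
//     typename T::PrimitiveIntType Bits = 0;
//     memcpy(&Bits, &Value, sizeof(Value)); // reinterpret the bit pattern
//     Out.print(T::AsmTag);                 // assembler directive for Ty
//     Out.printf(T::PrintfString, Bits);    // formatted pool entry
//   }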
} // end of namespace X86

namespace X86NAMESPACE {

using Utils::BoolFlagSaver;

template <typename Traits> class BoolFoldingEntry {
  BoolFoldingEntry(const BoolFoldingEntry &) = delete;

public:
  BoolFoldingEntry() = default;
  explicit BoolFoldingEntry(Inst *I);
  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
  /// Instr is the instruction producing the i1-type variable of interest.
  Inst *Instr = nullptr;
  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
  bool IsComplex = false;
  /// IsLiveOut is initialized conservatively to true, and is set to false when
  /// we encounter an instruction that ends Var's live range. We disable the
  /// folding optimization when Var is live beyond this basic block. Note that
  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
  /// always be true and the folding optimization will never be performed.
  bool IsLiveOut = true;
  // NumUses counts the number of times Var is used as a source operand in the
  // basic block. If IsComplex is true and there is more than one use of Var,
  // then the folding optimization is disabled for Var.
  uint32_t NumUses = 0;
};
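
// For illustration only, the pattern BoolFoldingEntry tracks looks like this
// in the high-level IR (a sketch, not actual dump output):
//
//   b = icmp slt a, c    ; the producer defines the i1-type variable "b"
//   br b, label1, label2 ; the consumer is "b"'s only (and last) use
//
// When folding applies, lowering emits a single cmp+jcc pair at the consumer
// instead of first materializing "b" in a register.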

template <typename Traits> class BoolFolding {
public:
  enum BoolFoldingProducerKind {
    PK_None,
    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
    PK_Icmp32,
    PK_Icmp64,
    PK_Fcmp,
    PK_Trunc,
    PK_Arith // A flag-setting arithmetic instruction.
  };

  /// Currently the actual enum values are not used (other than CK_None), but we
  /// go ahead and produce them anyway for symmetry with the
  /// BoolFoldingProducerKind.
  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };

private:
  BoolFolding(const BoolFolding &) = delete;
  BoolFolding &operator=(const BoolFolding &) = delete;

public:
  BoolFolding() = default;
  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
  static bool hasComplexLowering(const Inst *Instr);
  static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
                             BoolFoldingConsumerKind ConsumerKind);
  void init(CfgNode *Node);
  const Inst *getProducerFor(const Operand *Opnd) const;
  void dump(const Cfg *Func) const;

private:
  /// Returns true if Producers contains a valid entry for the given VarNum.
  bool containsValid(SizeT VarNum) const {
    auto Element = Producers.find(VarNum);
    return Element != Producers.end() && Element->second.Instr != nullptr;
  }
  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
  void invalidateProducersOnStore(const Inst *Instr);
  /// Producers maps Variable::Number to a BoolFoldingEntry.
  CfgUnorderedMap<SizeT, BoolFoldingEntry<Traits>> Producers;
};

template <typename Traits>
BoolFoldingEntry<Traits>::BoolFoldingEntry(Inst *I)
    : Instr(I), IsComplex(BoolFolding<Traits>::hasComplexLowering(I)) {}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingProducerKind
BoolFolding<Traits>::getProducerKind(const Inst *Instr) {
  if (llvm::isa<InstIcmp>(Instr)) {
    if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
      return PK_Icmp32;
    return PK_Icmp64;
  }
  if (llvm::isa<InstFcmp>(Instr))
    return PK_Fcmp;
  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
    if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
      switch (Arith->getOp()) {
      default:
        return PK_None;
      case InstArithmetic::And:
      case InstArithmetic::Or:
        return PK_Arith;
      }
    }
  }
  return PK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return PK_None;
    case InstCast::Trunc:
      return PK_Trunc;
    }
  }
  return PK_None;
}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingConsumerKind
BoolFolding<Traits>::getConsumerKind(const Inst *Instr) {
  if (llvm::isa<InstBr>(Instr))
    return CK_Br;
  if (llvm::isa<InstSelect>(Instr))
    return CK_Select;
  return CK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return CK_None;
    case InstCast::Sext:
      return CK_Sext;
    case InstCast::Zext:
      return CK_Zext;
    }
  }
  return CK_None;
}

/// Returns true if the producing instruction has a "complex" lowering sequence.
/// This generally means that its lowering sequence requires more than one
/// conditional branch, namely 64-bit integer compares and some floating-point
/// compares. When this is true and there is more than one consumer, we prefer
/// to disable the folding optimization, since duplicating the producer at each
/// consumer would increase the number of branches.
template <typename Traits>
bool BoolFolding<Traits>::hasComplexLowering(const Inst *Instr) {
  switch (getProducerKind(Instr)) {
  default:
    return false;
  case PK_Icmp64:
    return !Traits::Is64Bit;
  case PK_Fcmp:
    return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
           Traits::Cond::Br_None;
  }
}
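
// For example, on a 32-bit target a PK_Icmp64 producer must compare the high
// and low 32-bit halves separately, and a PK_Fcmp producer whose TableFcmp
// entry carries a second condition (C2 != Br_None) needs two conditional
// branches. Duplicating such a producer at each consumer would multiply those
// branches, hence the more-than-one-consumer restriction described above.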

template <typename Traits>
bool BoolFolding<Traits>::isValidFolding(
    typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind,
    typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind) {
  switch (ProducerKind) {
  default:
    return false;
  case PK_Icmp32:
  case PK_Icmp64:
  case PK_Fcmp:
    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
  case PK_Arith:
    return ConsumerKind == CK_Br;
  }
}

template <typename Traits> void BoolFolding<Traits>::init(CfgNode *Node) {
  Producers.clear();
  for (Inst &Instr : Node->getInsts()) {
    if (Instr.isDeleted())
      continue;
    invalidateProducersOnStore(&Instr);
    // Check whether Instr is a valid producer.
    Variable *Var = Instr.getDest();
    if (Var) { // only consider instructions with an actual dest var
      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
          Producers[Var->getIndex()] = BoolFoldingEntry<Traits>(&Instr);
        }
      }
    }
    // Check each src variable against the map.
    FOREACH_VAR_IN_INST(Var, Instr) {
      SizeT VarNum = Var->getIndex();
      if (!containsValid(VarNum))
        continue;
      // All valid consumers use Var as the first source operand.
      if (IndexOfVarOperandInInst(Var) != 0) {
        setInvalid(VarNum);
        continue;
      }
      // Consumer instructions must be white-listed.
      typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind =
          getConsumerKind(&Instr);
      if (ConsumerKind == CK_None) {
        setInvalid(VarNum);
        continue;
      }
      typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind =
          getProducerKind(Producers[VarNum].Instr);
      if (!isValidFolding(ProducerKind, ConsumerKind)) {
        setInvalid(VarNum);
        continue;
      }
      // Avoid creating multiple copies of complex producer instructions.
      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
        setInvalid(VarNum);
        continue;
      }
      ++Producers[VarNum].NumUses;
      if (Instr.isLastUse(Var)) {
        Producers[VarNum].IsLiveOut = false;
      }
    }
  }
  for (auto &I : Producers) {
    // Ignore entries previously marked invalid.
    if (I.second.Instr == nullptr)
      continue;
    // Disable the producer if its dest may be live beyond this block.
    if (I.second.IsLiveOut) {
      setInvalid(I.first);
      continue;
    }
    // Mark as "dead" rather than outright deleting. This is so that other
    // peephole style optimizations during or before lowering have access to
    // this instruction in undeleted form. See for example
    // tryOptimizedCmpxchgCmpBr().
    I.second.Instr->setDead();
  }
}

template <typename Traits>
const Inst *BoolFolding<Traits>::getProducerFor(const Operand *Opnd) const {
  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
  if (Var == nullptr)
    return nullptr;
  SizeT VarNum = Var->getIndex();
  auto Element = Producers.find(VarNum);
  if (Element == Producers.end())
    return nullptr;
  return Element->second.Instr;
}

template <typename Traits>
void BoolFolding<Traits>::dump(const Cfg *Func) const {
  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
    return;
  OstreamLocker L(Func->getContext());
  Ostream &Str = Func->getContext()->getStrDump();
  for (auto &I : Producers) {
    if (I.second.Instr == nullptr)
      continue;
    Str << "Found foldable producer:\n  ";
    I.second.Instr->dump(Func);
    Str << "\n";
  }
}

/// If the given instruction has potential memory side effects (e.g. store, rmw,
/// or a call instruction with potential memory side effects), then we must not
/// allow a pre-store Producer instruction with memory operands to be folded
/// into a post-store Consumer instruction.  If this is detected, the Producer
/// is invalidated.
///
/// We use the Producer's IsLiveOut field to determine whether any potential
/// Consumers come after this store instruction.  The IsLiveOut field is
/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
/// sees the variable's definitive last use (indicating the variable is not in
/// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
/// know that there can be no consumers after the store, and therefore we know
/// the folding is safe despite the store instruction.
template <typename Traits>
void BoolFolding<Traits>::invalidateProducersOnStore(const Inst *Instr) {
  if (!Instr->isMemoryWrite())
    return;
  for (auto &ProducerPair : Producers) {
    if (!ProducerPair.second.IsLiveOut)
      continue;
    Inst *PInst = ProducerPair.second.Instr;
    if (PInst == nullptr)
      continue;
    bool HasMemOperand = false;
    const SizeT SrcSize = PInst->getSrcSize();
    for (SizeT I = 0; I < SrcSize; ++I) {
      if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) {
        HasMemOperand = true;
        break;
      }
    }
    if (!HasMemOperand)
      continue;
    setInvalid(ProducerPair.first);
  }
}
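
// For illustration (a sketch, not actual dump output), the hazard guarded
// against looks like:
//
//   b = icmp eq x, [mem]  ; producer with a memory operand
//   store y, [mem]        ; may write the location the producer read
//   br b, label1, label2  ; consumer after the store
//
// Folding the producer down into the consumer would re-read [mem] after the
// store, so any producer with a memory operand that may still have a consumer
// (IsLiveOut is true) is invalidated here.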

template <typename TraitsType>
void TargetX86Base<TraitsType>::initNodeForLowering(CfgNode *Node) {
  FoldingInfo.init(Node);
  FoldingInfo.dump(Func);
}

template <typename TraitsType>
TargetX86Base<TraitsType>::TargetX86Base(Cfg *Func)
    : TargetLowering(Func), NeedSandboxing(SandboxingType == ST_NaCl) {
  static_assert(
      (Traits::InstructionSet::End - Traits::InstructionSet::Begin) ==
          (TargetInstructionSet::X86InstructionSet_End -
           TargetInstructionSet::X86InstructionSet_Begin),
      "Traits::InstructionSet range different from TargetInstructionSet");
  if (getFlags().getTargetInstructionSet() !=
      TargetInstructionSet::BaseInstructionSet) {
    InstructionSet = static_cast<InstructionSetEnum>(
        (getFlags().getTargetInstructionSet() -
         TargetInstructionSet::X86InstructionSet_Begin) +
        Traits::InstructionSet::Begin);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
  RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
  Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
                          Traits::getRegName, getRegClassName);
  PcRelFixup = Traits::FK_PcRel;
  AbsFixup = getFlags().getUseNonsfi() ? Traits::FK_Gotoff : Traits::FK_Abs;
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::shouldBePooled(const Constant *C) {
  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
    return !Utils::isPositiveZero(ConstFloat->getValue());
  }
  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
    return !Utils::isPositiveZero(ConstDouble->getValue());
  }
  if (getFlags().getRandomizeAndPoolImmediatesOption() != RPI_Pool) {
    return false;
  }
  return C->shouldBeRandomizedOrPooled();
}

template <typename TraitsType>
::Ice::Type TargetX86Base<TraitsType>::getPointerType() {
  if (!Traits::Is64Bit ||
      ::Ice::getFlags().getApplicationBinaryInterface() == ::Ice::ABI_PNaCl) {
    return ::Ice::IceType_i32;
  }
  return ::Ice::IceType_i64;
}

template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();
  Func->dump("After target helper call insertion");

  // Merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = true;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  // Run this early so it can be used to focus optimizations on potentially hot
  // code.
  // TODO(stichnot,ascull): currently only used for regalloc, not for the
  // expensive high-level optimizations that could be focused on potentially
  // hot code.
  Func->generateLoopInfo();
  Func->dump("After loop analysis");
  if (getFlags().getLoopInvariantCodeMotion()) {
    Func->loopInvariantCodeMotion();
    Func->dump("After LICM");
  }

  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
    Func->dump("After Local CSE");
    Func->floatConstantCSE();
  }
  if (getFlags().getEnableShortCircuit()) {
    Func->shortCircuitJumps();
    Func->dump("After Short Circuiting");
  }

  if (!getFlags().getEnablePhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
    if (Func->hasError())
      return;
    Func->placePhiStores();
    if (Func->hasError())
      return;
    Func->deletePhis();
    if (Func->hasError())
      return;
    Func->dump("After Phi lowering");
  }

  // Address mode optimization.
  Func->getVMetadata()->init(VMK_SingleDefs);
  Func->doAddressOpt();
  Func->materializeVectorShuffles();

  // Find read-modify-write opportunities. Do this after address mode
  // optimization so that doAddressOpt() doesn't need to be applied to RMW
  // instructions as well.
  findRMW();
  Func->dump("After RMW transform");

  // Argument lowering
  Func->doArgLowering();

  // Target lowering. This requires liveness analysis for some parts of the
  // lowering decisions, such as compare/branch fusing. If non-lightweight
  // liveness analysis is used, the instructions need to be renumbered first.
  // TODO: This renumbering should only be necessary if we're actually
  // calculating live intervals, which we only do for register allocation.
  Func->renumberInstructions();
  if (Func->hasError())
    return;

  // TODO: It should be sufficient to use the fastest liveness calculation,
  // i.e. livenessLightweight(). However, for some reason that slows down the
  // rest of the translation. Investigate.
  Func->liveness(Liveness_Basic);
  if (Func->hasError())
    return;
  Func->dump("After x86 address mode opt");

  // Disable constant blinding or pooling for load optimization.
  {
    BoolFlagSaver B(RandomizationPoolingPaused, true);
    doLoadOpt();
  }
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After x86 codegen");
  splitBlockLocalVariables(Func);

  // Register allocation. This requires instruction renumbering and full
  // liveness analysis. Loops must be identified before liveness so variable
  // use weights are correct.
  Func->renumberInstructions();
  if (Func->hasError())
    return;
  Func->liveness(Liveness_Intervals);
  if (Func->hasError())
    return;
  // The post-codegen dump is done here, after liveness analysis and associated
  // cleanup, to make the dump cleaner and more useful.
  Func->dump("After initial x86 codegen");
  // Validate the live range computations. The expensive validation call is
  // deliberately only made when assertions are enabled.
  assert(Func->validateLiveness());
  Func->getVMetadata()->init(VMK_All);
  regAlloc(RAK_Global);
  if (Func->hasError())
    return;
  Func->dump("After linear scan regalloc");

  if (getFlags().getEnablePhiEdgeSplit()) {
    Func->advancedPhiLowering();
    Func->dump("After advanced Phi lowering");
  }

  // Stack frame mapping.
  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  Func->contractEmptyNodes();
  Func->reorderNodes();

  // Shuffle basic block order if -reorder-basic-blocks is enabled.
  Func->shuffleNodes();

  // Branch optimization.  This needs to be done just before code emission. In
  // particular, no transformations that insert or reorder CfgNodes should be
  // done after branch optimization. We go ahead and do it before nop insertion
  // to reduce the amount of work needed for searching for opportunities.
  Func->doBranchOpt();
  Func->dump("After branch optimization");

  // Nop insertion if -nop-insertion is enabled.
  Func->doNopInsertion();

  // Mark nodes that require sandbox alignment
  if (NeedSandboxing) {
    Func->markNodesForSandboxing();
  }
}

template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() {
  TimerMarker T(TimerStack::TT_Om1, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();

  // Do not merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = false;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  Func->placePhiLoads();
  if (Func->hasError())
    return;
  Func->placePhiStores();
  if (Func->hasError())
    return;
  Func->deletePhis();
  if (Func->hasError())
    return;
  Func->dump("After Phi lowering");

  Func->doArgLowering();
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After initial x86 codegen");

  regAlloc(RAK_InfOnly);
  if (Func->hasError())
    return;
  Func->dump("After regalloc of infinite-weight variables");

  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  // Shuffle basic block order if -reorder-basic-blocks is enabled.
  Func->shuffleNodes();

  // Nop insertion if -nop-insertion is enabled.
  Func->doNopInsertion();

  // Mark nodes that require sandbox alignment
  if (NeedSandboxing)
    Func->markNodesForSandboxing();
}

inline bool canRMW(const InstArithmetic *Arith) {
  Type Ty = Arith->getDest()->getType();
  // X86 vector instructions write to a register and have no RMW option.
  if (isVectorType(Ty))
    return false;
  bool isI64 = Ty == IceType_i64;

  switch (Arith->getOp()) {
  // Not handled for lack of simple lowering:
  //   shift on i64
  //   mul, udiv, urem, sdiv, srem, frem
  // Not handled for lack of RMW instructions:
  //   fadd, fsub, fmul, fdiv (also vector types)
  default:
    return false;
  case InstArithmetic::Add:
  case InstArithmetic::Sub:
  case InstArithmetic::And:
  case InstArithmetic::Or:
  case InstArithmetic::Xor:
    return true;
  case InstArithmetic::Shl:
  case InstArithmetic::Lshr:
  case InstArithmetic::Ashr:
    return false; // TODO(stichnot): implement
    return !isI64;
  }
}
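
// For illustration, the RMW pattern that findRMW() below looks for can lower
// to a single memory-destination instruction (a sketch of the intent, not
// actual emitted text):
//
//   a = load addr               ; original IR
//   b = add a, other
//   store b, addr
//
//   add DWORD PTR [addr], other ; eventual x86 lowering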

template <typename TraitsType>
bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
  if (A == B)
    return true;
  if (auto *MemA =
          llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
              A)) {
    if (auto *MemB =
            llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
                B)) {
      return MemA->getBase() == MemB->getBase() &&
             MemA->getOffset() == MemB->getOffset() &&
             MemA->getIndex() == MemB->getIndex() &&
             MemA->getShift() == MemB->getShift() &&
             MemA->getSegmentRegister() == MemB->getSegmentRegister();
    }
  }
  return false;
}

template <typename TraitsType> void TargetX86Base<TraitsType>::findRMW() {
  TimerMarker _(TimerStack::TT_findRMW, Func);
  Func->dump("Before RMW");
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->lockStr();
  for (CfgNode *Node : Func->getNodes()) {
    // Walk through the instructions, considering each sequence of 3
    // instructions, and look for the particular RMW pattern. Note that this
    // search can be "broken" (false negatives) if there are intervening
    // deleted instructions, or intervening instructions that could be safely
    // moved out of the way to reveal an RMW pattern.
    auto E = Node->getInsts().end();
    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
      // Make I3 skip over deleted instructions.
      while (I3 != E && I3->isDeleted())
        ++I3;
      if (I1 == E || I2 == E || I3 == E)
        continue;
      assert(!I1->isDeleted());
      assert(!I2->isDeleted());
      assert(!I3->isDeleted());
      auto *Load = llvm::dyn_cast<InstLoad>(I1);
      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
      auto *Store = llvm::dyn_cast<InstStore>(I3);
      if (!Load || !Arith || !Store)
        continue;
      // Look for:
      //   a = Load addr
      //   b = <op> a, other
      //   Store b, addr
      // Change to:
      //   a = Load addr
      //   b = <op> a, other
      //   x = FakeDef
      //   RMW <op>, addr, other, x
      //   b = Store b, addr, x
      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
      // on the updated Store instruction, to avoid liveness problems later.
      //
      // With this transformation, the Store instruction acquires a Dest
      // variable and is now subject to dead code elimination if there are no
      // more uses of "b".  Variable "x" is a beacon for determining whether the
      // Store instruction gets dead-code eliminated.  If the Store instruction
      // is eliminated, then it must be the case that the RMW instruction ends
      // x's live range, and therefore the RMW instruction will be retained and
      // later lowered.  On the other hand, if the RMW instruction does not end
      // x's live range, then the Store instruction must still be present, and
      // therefore the RMW instruction is ignored during lowering because it is
      // redundant with the Store instruction.
      //
      // Note that if "a" has further uses, the RMW transformation may still
      // trigger, resulting in two loads and one store, which is worse than the
      // original one load and one store.  However, this is probably rare, and
      // caching probably keeps it just as fast.
      if (!isSameMemAddressOperand<TraitsType>(Load->getSourceAddress(),
                                               Store->getAddr()))
        continue;
      Operand *ArithSrcFromLoad = Arith->getSrc(0);
      Operand *ArithSrcOther = Arith->getSrc(1);
      if (ArithSrcFromLoad != Load->getDest()) {
        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
          continue;
        std::swap(ArithSrcFromLoad, ArithSrcOther);
      }
      if (Arith->getDest() != Store->getData())
        continue;
      if (!canRMW(Arith))
        continue;
      if (Func->isVerbose(IceV_RMW)) {
        Ostream &Str = Func->getContext()->getStrDump();
        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
        Load->dump(Func);
        Str << "\n  ";
        Arith->dump(Func);
        Str << "\n  ";
        Store->dump(Func);
        Str << "\n";
      }
      Variable *Beacon = Func->makeVariable(IceType_i32);
      Beacon->setMustNotHaveReg();
      Store->setRmwBeacon(Beacon);
      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
      Node->getInsts().insert(I3, BeaconDef);
      auto *RMW = InstX86FakeRMW::create(Func, ArithSrcOther, Store->getAddr(),
                                         Beacon, Arith->getOp());
      Node->getInsts().insert(I3, RMW);
    }
  }
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->unlockStr();
}

// Converts a ConstantInteger32 operand into its constant value, or
// MemoryOrderInvalid if the operand is not a ConstantInteger32.
inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
    return Integer->getValue();
  return Intrinsics::MemoryOrderInvalid;
}
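
// Typical use (mirroring the check in doLoadOpt() below): validate the
// ordering argument of an atomic intrinsic before optimizing around it, e.g.
//
//   Intrinsics::isMemoryOrderValid(
//       ID, getConstantMemoryOrder(Intrin->getArg(1)))
//
// A non-constant operand yields MemoryOrderInvalid, which fails the check.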

/// Determines whether the dest of a Load instruction can be folded into one of
/// the src operands of a 2-operand instruction. This is true as long as the
/// load dest matches exactly one of the binary instruction's src operands.
/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
                                      Operand *&Src0, Operand *&Src1) {
  if (Src0 == LoadDest && Src1 != LoadDest) {
    Src0 = LoadSrc;
    return true;
  }
  if (Src0 != LoadDest && Src1 == LoadDest) {
    Src1 = LoadSrc;
    return true;
  }
  return false;
}
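
// For illustration (a sketch, not actual dump output), doLoadOpt() below uses
// this to rewrite:
//
//   a = load addr
//   c = add a, b      ; this use ends "a"'s live range
//
// into a single instruction whose source is the memory operand:
//
//   c = add [addr], b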

template <typename TraitsType> void TargetX86Base<TraitsType>::doLoadOpt() {
  TimerMarker _(TimerStack::TT_loadOpt, Func);
  for (CfgNode *Node : Func->getNodes()) {
    Context.init(Node);
    while (!Context.atEnd()) {
      Variable *LoadDest = nullptr;
      Operand *LoadSrc = nullptr;
      Inst *CurInst = iteratorToInst(Context.getCur());
      Inst *Next = Context.getNextInst();
      // Determine whether the current instruction is a Load instruction or
      // equivalent.
      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
        // An InstLoad always qualifies.
        LoadDest = Load->getDest();
        constexpr bool DoLegalize = false;
        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
                                    LoadDest->getType(), DoLegalize);
      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
        // ordering, and can be implemented in a single instruction (i.e., not
        // i64 on x86-32).
        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
        if (ID == Intrinsics::AtomicLoad &&
            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
            Intrinsics::isMemoryOrderValid(
                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
          LoadDest = Intrin->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
                                      DoLegalize);
        }
      }
      // A Load instruction can be folded into the following instruction only
      // if the following instruction ends the Load's Dest variable's live
      // range.
      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
        assert(LoadSrc);
        Inst *NewInst = nullptr;
        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
          Operand *Src0 = Arith->getSrc(0);
          Operand *Src1 = Arith->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstArithmetic::create(Func, Arith->getOp(),
                                             Arith->getDest(), Src0, Src1);
          }
        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
          Operand *Src0 = Icmp->getSrc(0);
          Operand *Src1 = Icmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
                                       Icmp->getDest(), Src0, Src1);
          }
        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
          Operand *Src0 = Fcmp->getSrc(0);
          Operand *Src1 = Fcmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
                                       Fcmp->getDest(), Src0, Src1);
          }
        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
          Operand *Src0 = Select->getTrueOperand();
          Operand *Src1 = Select->getFalseOperand();
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstSelect::create(Func, Select->getDest(),
                                         Select->getCondition(), Src0, Src1);
          }
        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
          // The load dest can always be folded into a Cast instruction.
          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
          if (Src0 == LoadDest) {
            NewInst = InstCast::create(Func, Cast->getCastKind(),
                                       Cast->getDest(), LoadSrc);
          }
        }
        if (NewInst) {
          CurInst->setDeleted();
          Next->setDeleted();
          Context.insert(NewInst);
          // Update NewInst->LiveRangesEnded so that target lowering may
          // benefit. Also update NewInst->HasSideEffects.
          NewInst->spliceLivenessInfo(Next, CurInst);
        }
      }
      Context.advanceCur();
      Context.advanceNext();
    }
  }
  Func->dump("After load optimization");
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
    return Br->optimizeBranch(NextNode);
  }
  return false;
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::getPhysicalRegister(RegNumT RegNum,
                                                         Type Ty) {
  if (Ty == IceType_void)
    Ty = IceType_i32;
  if (PhysicalRegisters[Ty].empty())
    PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
  Variable *Reg = PhysicalRegisters[Ty][RegNum];
  if (Reg == nullptr) {
    Reg = Func->makeVariable(Ty);
    Reg->setRegNum(RegNum);
    PhysicalRegisters[Ty][RegNum] = Reg;
    // Specially mark a named physical register as an "argument" so that it is
    // considered live upon function entry.  Otherwise it's possible to get
    // liveness validation errors for saving callee-save registers.
    Func->addImplicitArg(Reg);
    // Don't bother tracking the live range of a named physical register.
    Reg->setIgnoreLiveness();
  }
  assert(Traits::getGprForType(Ty, RegNum) == RegNum);
  return Reg;
}

template <typename TraitsType>
const char *TargetX86Base<TraitsType>::getRegName(RegNumT RegNum,
                                                  Type Ty) const {
  return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emitVariable(const Variable *Var) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  if (Var->hasReg()) {
    const bool Is64BitSandboxing = Traits::Is64Bit && NeedSandboxing;
    const Type VarType = (Var->isRematerializable() && Is64BitSandboxing)
                             ? IceType_i64
                             : Var->getType();
    Str << "%" << getRegName(Var->getRegNum(), VarType);
    return;
  }
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  const int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (BaseRegNum.hasNoValue())
    BaseRegNum = getFrameOrStackReg();

  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
  if (getFlags().getDecorateAsm()) {
    Str << Var->getSymbolicStackOffset();
  } else if (Offset != 0) {
    Str << Offset;
  }
  const Type FrameSPTy = Traits::WordType;
  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
}
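
// For illustration (hypothetical output): a register-allocated i32 Variable
// might emit as "%eax", while a stack variable at offset -8 from the frame
// pointer emits as "-8(%ebp)", and one at offset 0 simply as "(%ebp)".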

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86Address
TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const {
  if (Var->hasReg())
    llvm::report_fatal_error("Stack Variable has a register assigned");
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (Var->getBaseRegNum().hasNoValue())
    BaseRegNum = getFrameOrStackReg();
  return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
                    AssemblerFixup::NoFixup);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
  // Stack frame layout:
  //
  // +------------------------+
  // | 1. return address      |
  // +------------------------+
  // | 2. preserved registers |
  // +------------------------+ <--- BasePointer (if used)
  // | 3. padding             |
  // +------------------------+
  // | 4. global spill area   |
  // +------------------------+
  // | 5. padding             |
  // +------------------------+
  // | 6. local spill area    |
  // +------------------------+
  // | 7. padding             |
  // +------------------------+
  // | 8. allocas             |
  // +------------------------+
  // | 9. padding             |
  // +------------------------+
  // | 10. out args           |
  // +------------------------+ <--- StackPointer
  //
  // The following variables record the size in bytes of the given areas:
  //  * X86_RET_IP_SIZE_BYTES:   area 1
  //  * PreservedRegsSizeBytes:  area 2
  //  * SpillAreaPaddingBytes:   area 3
  //  * GlobalsSize:             area 4
  //  * LocalsSlotsPaddingBytes: area 5
  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
  //  * LocalsSpillAreaSize:     area 6
  //  * FixedAllocaSizeBytes:    areas 7 - 8
  //  * SpillAreaSizeBytes:      areas 3 - 10
  //  * maxOutArgsSizeBytes():   areas 9 - 10

  // Determine stack frame offsets for each Variable without a register
  // assignment. This can be done as one variable per stack slot. Or, do
  // coalescing by running the register allocator again with an infinite set of
  // registers (as a side effect, this gives variables a second chance at
  // physical register assignment).
  //
  // A middle ground approach is to leverage sparsity and allocate one block of
  // space on the frame for globals (variables with multi-block lifetime), and
  // one block to share for locals (single-block lifetime).

  Context.init(Node);
  Context.setInsertPoint(Context.getCur());

  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  RegsUsed = SmallBitVector(CalleeSaves.size());
  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
  size_t GlobalsSize = 0;
  // If there is a separate locals area, this represents that area. Otherwise
  // it counts any variable not counted by GlobalsSize.
  SpillAreaSizeBytes = 0;
  // If there is a separate locals area, this specifies the alignment for it.
  uint32_t LocalsSlotsAlignmentBytes = 0;
  // The entire spill locations area gets aligned to largest natural alignment
  // of the variables that have a spill slot.
  uint32_t SpillAreaAlignmentBytes = 0;
  // A spill slot linked to a variable with a stack slot should reuse that
  // stack slot.
  std::function<bool(Variable *)> TargetVarHook =
      [&VariablesLinkedToSpillSlots](Variable *Var) {
        // TODO(stichnot): Refactor this into the base class.
        Variable *Root = Var->getLinkedToStackRoot();
        if (Root != nullptr) {
          assert(!Root->hasReg());
          if (!Root->hasReg()) {
            VariablesLinkedToSpillSlots.push_back(Var);
            return true;
          }
        }
        return false;
      };

  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
                        &LocalsSlotsAlignmentBytes, TargetVarHook);
  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
  SpillAreaSizeBytes += GlobalsSize;

  // Add push instructions for preserved registers.
  uint32_t NumCallee = 0;
  size_t PreservedRegsSizeBytes = 0;
  SmallBitVector Pushed(CalleeSaves.size());
  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
    const auto Canonical = Traits::getBaseReg(i);
    assert(Canonical == Traits::getBaseReg(Canonical));
    if (RegsUsed[i]) {
      Pushed[Canonical] = true;
    }
  }
  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
    assert(RegNum == Traits::getBaseReg(RegNum));
    ++NumCallee;
    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    _push_reg(getPhysicalRegister(RegNum, Traits::WordType));
  }
  Ctx->statsUpdateRegistersSaved(NumCallee);

  // Generate "push frameptr; mov frameptr, stackptr"
  if (IsEbpBasedFrame) {
    assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
               .count() == 0);
    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    _link_bp();
  }

  // Align the variables area. SpillAreaPaddingBytes is the size of the region
  // after the preserved registers and before the spill areas.
  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
  // locals area if they are separate.
  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
  uint32_t SpillAreaPaddingBytes = 0;
  uint32_t LocalsSlotsPaddingBytes = 0;
  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
                       SpillAreaAlignmentBytes, GlobalsSize,
                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
                       &LocalsSlotsPaddingBytes);
  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
  uint32_t GlobalsAndSubsequentPaddingSize =
      GlobalsSize + LocalsSlotsPaddingBytes;

  // Functions returning scalar floating point types may need to convert values
  // from an in-register xmm value to the top of the x87 floating point stack.
  // This is done by a movp[sd] and an fld[sd].  Ensure there is enough scratch
  // space on the stack for this.
  const Type ReturnType = Func->getReturnType();
  if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
    if (isScalarFloatingType(ReturnType)) {
      // Avoid misaligned double-precision load/store.
      RequiredStackAlignment = std::max<size_t>(
          RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
      SpillAreaSizeBytes =
          std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
    }
  }

  RequiredStackAlignment =
      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);

  if (PrologEmitsFixedAllocas) {
    RequiredStackAlignment =
        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
  }

  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
  // fixed allocations in the prolog.
  if (PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  // Entering the function has made the stack pointer unaligned. Re-align it by
  // adjusting the stack size.
  uint32_t StackOffset = Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
                                             RequiredStackAlignment);
  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
                                    RequiredStackAlignment);
  SpillAreaSizeBytes = StackSize - StackOffset;
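  // Worked example with illustrative numbers only: suppose a 4-byte return
  // address, 8 bytes of preserved registers (StackOffset = 12), 20 bytes of
  // spill area, 16 bytes of out args, and RequiredStackAlignment = 16. Then:
  //   StackSize = applyAlignment(12 + 20, 16) = 32
  //   StackSize = applyAlignment(32 + 16, 16) = 48
  //   SpillAreaSizeBytes = 48 - 12 = 36
  // so the total frame (12 + 36 = 48) is a multiple of 16, restoring the
  // stack pointer's alignment after the "sub stackptr" below.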

  if (SpillAreaSizeBytes) {
    // Generate "sub stackptr, SpillAreaSizeBytes"
    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
  }

  // If the required alignment is greater than the stack pointer's guaranteed
  // alignment, align the stack pointer accordingly.
  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
    assert(IsEbpBasedFrame);
    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
         Ctx->getConstantInt32(-RequiredStackAlignment));
  }

  // Account for known-frame-offset alloca instructions that were not already
  // combined into the prolog.
  if (!PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
  // Arg[0] is closest to the stack/frame pointer.
  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
  size_t BasicFrameOffset =
      PreservedRegsSizeBytes + Traits::X86_RET_IP_SIZE_BYTES;
  if (!IsEbpBasedFrame)
    BasicFrameOffset += SpillAreaSizeBytes;

  emitGetIP(Node);

  const VarList &Args = Func->getArgs();
  size_t InArgsSizeBytes = 0;
  unsigned NumXmmArgs = 0;
  unsigned NumGPRArgs = 0;
  for (Variable *Arg : Args) {
    // Skip arguments passed in registers.
    if (isVectorType(Arg->getType())) {
      if (Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else if (isScalarFloatingType(Arg->getType())) {
      if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
          Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else {
      assert(isScalarIntegerType(Arg->getType()));
      if (Traits::getRegisterForGprArgNum(Traits::WordType, NumGPRArgs)
              .hasValue()) {
        ++NumGPRArgs;
        continue;
      }
    }
    // For esp-based frames where the allocas are done outside the prolog, the
    // esp value may not stabilize to its home value until after all the
    // fixed-size alloca instructions have executed.  In this case, a stack
    // adjustment is needed when accessing in-args in order to copy them into
    // registers.
    size_t StackAdjBytes = 0;
    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
      StackAdjBytes -= FixedAllocaSizeBytes;
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
                           InArgsSizeBytes);
  }

  // Fill in stack offsets for locals.
  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
                      IsEbpBasedFrame && !needsStackPointerAlignment());
  // Assign stack offsets to variables that have been linked to spilled
  // variables.
  for (Variable *Var : VariablesLinkedToSpillSlots) {
    const Variable *Root = Var->getLinkedToStackRoot();
    assert(Root != nullptr);
    Var->setStackOffset(Root->getStackOffset());
  }
  this->HasComputedFrame = true;

  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
    OstreamLocker L(Func->getContext());
    Ostream &Str = Func->getContext()->getStrDump();

    Str << "Stack layout:\n";
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
        maxOutArgsSizeBytes();
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
        << " globals spill area = " << GlobalsSize << " bytes\n"
        << " globals-locals spill areas intermediate padding = "
        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
        << " esp alignment padding = " << EspAdjustmentPaddingSize
        << " bytes\n";

    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";
  }
}

/// Helper function for addProlog().
///
/// This assumes Arg is an argument passed on the stack. This sets the frame
/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
/// I64 arg that has been split into Lo and Hi components, it calls itself
/// recursively on the components, taking care to handle Lo first because of the
/// little-endian architecture. Lastly, this function generates an instruction
/// to copy Arg into its assigned register if applicable.
template <typename TraitsType>
void TargetX86Base<TraitsType>::finishArgumentLowering(
    Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset,
    size_t StackAdjBytes, size_t &InArgsSizeBytes) {
  if (!Traits::Is64Bit) {
    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
      Variable *Lo = Arg64On32->getLo();
      Variable *Hi = Arg64On32->getHi();
      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      return;
    }
  }
  Type Ty = Arg->getType();
  if (isVectorType(Ty)) {
    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
  }
  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
  if (Arg->hasReg()) {
    assert(Ty != IceType_i64 || Traits::Is64Bit);
    auto *Mem = X86OperandMem::create(
        Func, Ty, FramePtr,
        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
    if (isVectorType(Arg->getType())) {
      _movp(Arg, Mem);
    } else {
      _mov(Arg, Mem);
    }
    // This argument-copying instruction uses an explicit X86OperandMem
    // operand instead of a Variable, so its fill-from-stack operation has to
    // be tracked separately for statistics.
    Ctx->statsUpdateFills();
  }
}
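
// For illustration (a sketch with assumed offsets): on x86-32 with an
// ebp-based frame and only ebp pushed, BasicFrameOffset is 8, so an i64
// argument split into Lo/Hi occupies two consecutive 4-byte in-arg slots and,
// if register-allocated, is copied in with something like:
//
//   mov eax, DWORD PTR [ebp + 8]    ; Lo half
//   mov edx, DWORD PTR [ebp + 12]   ; Hi half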

template <typename TraitsType>
void TargetX86Base<TraitsType>::addEpilog(CfgNode *Node) {
  InstList &Insts = Node->getInsts();
  InstList::reverse_iterator RI, E;
  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
    if (llvm::isa<typename Traits::Insts::Ret>(*RI))
      break;
  }
  if (RI == E)
    return;

  // Convert the reverse_iterator position into its corresponding (forward)
  // iterator position.
  InstList::iterator InsertPoint = reverseToForwardIterator(RI);
  --InsertPoint;
  Context.init(Node);
  Context.setInsertPoint(InsertPoint);

  if (IsEbpBasedFrame) {
    _unlink_bp();
  } else {
    // add stackptr, SpillAreaSizeBytes
    if (SpillAreaSizeBytes != 0) {
      _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
    }
  }

  // Add pop instructions for preserved registers.
  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  SmallBitVector Popped(CalleeSaves.size());
  for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
    const auto RegNum = RegNumT::fromInt(i);
    if (RegNum == getFrameReg() && IsEbpBasedFrame)
      continue;
    const RegNumT Canonical = Traits::getBaseReg(RegNum);
    if (CalleeSaves[i] && RegsUsed[i]) {
      Popped[Canonical] = true;
    }
  }
  for (int32_t i = Popped.size() - 1; i >= 0; --i) {
    if (!Popped[i])
      continue;
    const auto RegNum = RegNumT::fromInt(i);
    assert(RegNum == Traits::getBaseReg(RegNum));
    _pop(getPhysicalRegister(RegNum, Traits::WordType));
  }

  if (!NeedSandboxing) {
    return;
  }
  emitSandboxedReturn();
  if (RI->getSrcSize()) {
    auto *RetValue = llvm::cast<Variable>(RI->getSrc(0));
    Context.insert<InstFakeUse>(RetValue);
  }
  RI->setDeleted();
}

template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
  return Traits::WordType;
}

template <typename TraitsType>
template <typename T>
typename std::enable_if<!T::Is64Bit, Operand>::type *
TargetX86Base<TraitsType>::loOperand(Operand *Operand) {
  assert(Operand->getType() == IceType_i64 ||
         Operand->getType() == IceType_f64);
  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
    return Var64On32->getLo();
  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
    // Check if we need to blind/pool the constant.
    return legalize(ConstInt);
  }
  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
    auto *MemOperand = X86OperandMem::create(
        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
1400     // Test whether the offset should be randomized or pooled; if so, create
1401     // a mem operand with the blinded/pooled constant. Otherwise, return the
1402     // mem operand unchanged.
1403     return legalize(MemOperand);
1404   }
1405   llvm_unreachable("Unsupported operand type");
1406   return nullptr;
1407 }
1408 
1409 template <typename TraitsType>
1410 template <typename T>
1411 typename std::enable_if<!T::Is64Bit, Operand>::type *
1412 TargetX86Base<TraitsType>::hiOperand(Operand *Operand) {
1413   assert(Operand->getType() == IceType_i64 ||
1414          Operand->getType() == IceType_f64);
1415   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1416     return Operand;
1417   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1418     return Var64On32->getHi();
1419   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1420     auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1421         Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
1422     // Check if we need to blind/pool the constant.
1423     return legalize(ConstInt);
1424   }
1425   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1426     Constant *Offset = Mem->getOffset();
1427     if (Offset == nullptr) {
1428       Offset = Ctx->getConstantInt32(4);
1429     } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
1430       Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
1431     } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
1432       assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
1433       Offset =
1434           Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
1435     }
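    // E.g., if the low word of an i64 lives at [ebp+8], the high word is
    // addressed at [ebp+12]; the base register here is for illustration only.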
1436     auto *MemOperand = X86OperandMem::create(
1437         Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
1438         Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
1439     // Test whether the Offset is an i32 constant eligible for randomization
1440     // or pooling. Blind/pool it if it is. Otherwise return an ordinary mem
1441     // operand.
1442     return legalize(MemOperand);
1443   }
1444   llvm_unreachable("Unsupported operand type");
1445   return nullptr;
1446 }
1447 
1448 template <typename TraitsType>
1449 SmallBitVector
1450 TargetX86Base<TraitsType>::getRegisterSet(RegSetMask Include,
1451                                           RegSetMask Exclude) const {
1452   return Traits::getRegisterSet(getFlags(), Include, Exclude);
1453 }
1454 
1455 template <typename TraitsType>
1456 void TargetX86Base<TraitsType>::lowerAlloca(const InstAlloca *Instr) {
1457   // Conservatively require the stack to be aligned. Some stack adjustment
1458   // operations implemented below assume that the stack is aligned before the
1459   // alloca. All the alloca code ensures that the stack alignment is preserved
1460   // after the alloca. The stack alignment restriction can be relaxed in some
1461   // cases.
1462   RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
1463                                             Traits::X86_STACK_ALIGNMENT_BYTES);
1464 
1465   // For default align=0, set it to the real value 1, to avoid any
1466   // bit-manipulation problems below.
1467   const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
1468 
1469   // LLVM enforces power of 2 alignment.
1470   assert(llvm::isPowerOf2_32(AlignmentParam));
1471   assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));
1472 
1473   const uint32_t Alignment =
1474       std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
1475   const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
1476   const bool OptM1 = Func->getOptLevel() == Opt_m1;
1477   const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
1478   const bool UseFramePointer =
1479       hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
1480 
1481   if (UseFramePointer)
1482     setHasFramePointer();
1483 
1484   Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
1485   if (OverAligned) {
1486     _and(esp, Ctx->getConstantInt32(-Alignment));
1487   }
1488 
1489   Variable *Dest = Instr->getDest();
1490   Operand *TotalSize = legalize(Instr->getSizeInBytes());
1491 
1492   if (const auto *ConstantTotalSize =
1493           llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
1494     const uint32_t Value =
1495         Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
1496     if (UseFramePointer) {
1497       _sub_sp(Ctx->getConstantInt32(Value));
1498     } else {
1499       // If we don't need a frame pointer, this alloca has a known offset from
1500       // the stack pointer. We don't need to adjust the stack pointer, nor
1501       // assign any value to Dest, as Dest is rematerializable.
1502       assert(Dest->isRematerializable());
1503       FixedAllocaSizeBytes += Value;
1504       Context.insert<InstFakeDef>(Dest);
1505     }
1506   } else {
1507     // Non-constant sizes need to be adjusted to the next highest multiple of
1508     // the required alignment at runtime.
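    // E.g., with Alignment == 16, the add/and pair below computes
    // T = (T + 15) & -16, rounding the requested size up to the next multiple
    // of 16.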
1509     Variable *T = nullptr;
1510     if (Traits::Is64Bit && TotalSize->getType() != IceType_i64 &&
1511         !NeedSandboxing) {
1512       T = makeReg(IceType_i64);
1513       _movzx(T, TotalSize);
1514     } else {
1515       T = makeReg(IceType_i32);
1516       _mov(T, TotalSize);
1517     }
1518     _add(T, Ctx->getConstantInt32(Alignment - 1));
1519     _and(T, Ctx->getConstantInt32(-Alignment));
1520     _sub_sp(T);
1521   }
1522   // Add enough to the returned address to account for the out args area.
1523   uint32_t OutArgsSize = maxOutArgsSizeBytes();
1524   if (OutArgsSize > 0) {
1525     Variable *T = makeReg(IceType_i32);
1526     auto *CalculateOperand = X86OperandMem::create(
1527         Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
1528     _lea(T, CalculateOperand);
1529     _mov(Dest, T);
1530   } else {
1531     _mov(Dest, esp);
1532   }
1533 }
1534 
1535 template <typename TraitsType>
1536 void TargetX86Base<TraitsType>::lowerArguments() {
1537   const bool OptM1 = Func->getOptLevel() == Opt_m1;
1538   VarList &Args = Func->getArgs();
1539   unsigned NumXmmArgs = 0;
1540   bool XmmSlotsRemain = true;
1541   unsigned NumGprArgs = 0;
1542   bool GprSlotsRemain = true;
1543 
1544   Context.init(Func->getEntryNode());
1545   Context.setInsertPoint(Context.getCur());
1546 
1547   for (SizeT i = 0, End = Args.size();
1548        i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
1549     Variable *Arg = Args[i];
1550     Type Ty = Arg->getType();
1551     Variable *RegisterArg = nullptr;
1552     RegNumT RegNum;
1553     if (isVectorType(Ty)) {
1554       RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
1555       if (RegNum.hasNoValue()) {
1556         XmmSlotsRemain = false;
1557         continue;
1558       }
1559       ++NumXmmArgs;
1560       RegisterArg = Func->makeVariable(Ty);
1561     } else if (isScalarFloatingType(Ty)) {
1562       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
1563         continue;
1564       }
1565       RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
1566       if (RegNum.hasNoValue()) {
1567         XmmSlotsRemain = false;
1568         continue;
1569       }
1570       ++NumXmmArgs;
1571       RegisterArg = Func->makeVariable(Ty);
1572     } else if (isScalarIntegerType(Ty)) {
1573       RegNum = Traits::getRegisterForGprArgNum(Ty, NumGprArgs);
1574       if (RegNum.hasNoValue()) {
1575         GprSlotsRemain = false;
1576         continue;
1577       }
1578       ++NumGprArgs;
1579       RegisterArg = Func->makeVariable(Ty);
1580     }
1581     assert(RegNum.hasValue());
1582     assert(RegisterArg != nullptr);
1583     // Replace Arg in the argument list with the home register. Then generate
1584     // an instruction in the prolog to copy the home register to the assigned
1585     // location of Arg.
1586     if (BuildDefs::dump())
1587       RegisterArg->setName(Func, "home_reg:" + Arg->getName());
1588     RegisterArg->setRegNum(RegNum);
1589     RegisterArg->setIsArg();
1590     Arg->setIsArg(false);
1591 
1592     Args[i] = RegisterArg;
1593     // When not Om1, do the assignment through a temporary, instead of directly
1594     // from the pre-colored variable, so that a subsequent availabilityGet()
1595     // call has a chance to work.  (In Om1, don't bother creating extra
1596     // instructions with extra variables to register-allocate.)
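    // Sketch: when not Om1 this emits "Tmp = home_reg; Arg = Tmp", whereas
    // Om1 emits the single assignment "Arg = home_reg".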
1597     if (OptM1) {
1598       Context.insert<InstAssign>(Arg, RegisterArg);
1599     } else {
1600       Variable *Tmp = makeReg(RegisterArg->getType());
1601       Context.insert<InstAssign>(Tmp, RegisterArg);
1602       Context.insert<InstAssign>(Arg, Tmp);
1603     }
1604   }
1605   if (!OptM1)
1606     Context.availabilityUpdate();
1607 }
1608 
1609 /// Strength-reduce scalar integer multiplication by a constant (for i32 or
1610 /// narrower) for certain constants. The lea instruction can be used to multiply
1611 /// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
1612 /// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
1613 /// lea-based multiplies by 5, combined with left-shifting by 2.
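/// Illustrative sketch (simplified syntax, not the exact emitted form):
/// multiplying t by 100 = 5*5*4 becomes
///   lea t, [t+t*4]   ; t *= 5
///   lea t, [t+t*4]   ; t *= 5 (25x total)
///   shl t, 2         ; t *= 4 (100x total)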
1614 template <typename TraitsType>
1615 bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
1616                                                   int32_t Src1) {
1617   // Disable this optimization for Om1 and O0, just to keep things simple
1618   // there.
1619   if (Func->getOptLevel() < Opt_1)
1620     return false;
1621   Type Ty = Dest->getType();
1622   if (Src1 == -1) {
1623     Variable *T = nullptr;
1624     _mov(T, Src0);
1625     _neg(T);
1626     _mov(Dest, T);
1627     return true;
1628   }
1629   if (Src1 == 0) {
1630     _mov(Dest, Ctx->getConstantZero(Ty));
1631     return true;
1632   }
1633   if (Src1 == 1) {
1634     Variable *T = nullptr;
1635     _mov(T, Src0);
1636     _mov(Dest, T);
1637     return true;
1638   }
1639   // Don't bother with the edge case where Src1 == MININT.
1640   if (Src1 == -Src1)
1641     return false;
1642   const bool Src1IsNegative = Src1 < 0;
1643   if (Src1IsNegative)
1644     Src1 = -Src1;
1645   uint32_t Count9 = 0;
1646   uint32_t Count5 = 0;
1647   uint32_t Count3 = 0;
1648   uint32_t Count2 = 0;
1649   uint32_t CountOps = 0;
1650   while (Src1 > 1) {
1651     if (Src1 % 9 == 0) {
1652       ++CountOps;
1653       ++Count9;
1654       Src1 /= 9;
1655     } else if (Src1 % 5 == 0) {
1656       ++CountOps;
1657       ++Count5;
1658       Src1 /= 5;
1659     } else if (Src1 % 3 == 0) {
1660       ++CountOps;
1661       ++Count3;
1662       Src1 /= 3;
1663     } else if (Src1 % 2 == 0) {
1664       if (Count2 == 0)
1665         ++CountOps;
1666       ++Count2;
1667       Src1 /= 2;
1668     } else {
1669       return false;
1670     }
1671   }
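  // For example, Src1 == 45 factors as 9*5 (Count9 = 1, Count5 = 1,
  // CountOps = 2), and Src1 == 100 factors as 5*5*4 (Count5 = 2, Count2 = 2,
  // CountOps = 3). Any factor the loop cannot divide out (e.g. 7) bails.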
1672   // The lea-based steps only work for i32 (and i64 on x86-64), not i8/i16.
1673   if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
1674       (Count3 || Count5 || Count9))
1675     return false;
1676   // Limit the number of lea/shl operations for a single multiply, to a
1677   // somewhat arbitrary choice of 3.
1678   constexpr uint32_t MaxOpsForOptimizedMul = 3;
1679   if (CountOps > MaxOpsForOptimizedMul)
1680     return false;
1681   Variable *T = makeReg(Traits::WordType);
1682   if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
1683     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
1684     _movzx(T, Src0RM);
1685   } else {
1686     _mov(T, Src0);
1687   }
1688   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1689   for (uint32_t i = 0; i < Count9; ++i) {
1690     constexpr uint16_t Shift = 3; // log2(9-1)
1691     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1692   }
1693   for (uint32_t i = 0; i < Count5; ++i) {
1694     constexpr uint16_t Shift = 2; // log2(5-1)
1695     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1696   }
1697   for (uint32_t i = 0; i < Count3; ++i) {
1698     constexpr uint16_t Shift = 1; // log2(3-1)
1699     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1700   }
1701   if (Count2) {
1702     _shl(T, Ctx->getConstantInt(Ty, Count2));
1703   }
1704   if (Src1IsNegative)
1705     _neg(T);
1706   _mov(Dest, T);
1707   return true;
1708 }
1709 
1710 template <typename TraitsType>
1711 void TargetX86Base<TraitsType>::lowerShift64(InstArithmetic::OpKind Op,
1712                                              Operand *Src0Lo, Operand *Src0Hi,
1713                                              Operand *Src1Lo, Variable *DestLo,
1714                                              Variable *DestHi) {
1715   // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
1716   Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1717   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1718   Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1719   if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
1720     uint32_t ShiftAmount = ConstantShiftAmount->getValue();
1721     if (ShiftAmount > 32) {
1722       Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
1723       switch (Op) {
1724       default:
1725         assert(0 && "non-shift op");
1726         break;
1727       case InstArithmetic::Shl: {
1728         // a=b<<c ==>
1729         //   t2 = b.lo
1730         //   t2 = shl t2, ShiftAmount-32
1731         //   t3 = t2
1732         //   t2 = 0
1733         _mov(T_2, Src0Lo);
1734         _shl(T_2, ReducedShift);
1735         _mov(DestHi, T_2);
1736         _mov(DestLo, Zero);
1737       } break;
1738       case InstArithmetic::Lshr: {
1739         // a=b>>c (unsigned) ==>
1740         //   t2 = b.hi
1741         //   t2 = shr t2, ShiftAmount-32
1742         //   a.lo = t2
1743         //   a.hi = 0
1744         _mov(T_2, Src0Hi);
1745         _shr(T_2, ReducedShift);
1746         _mov(DestLo, T_2);
1747         _mov(DestHi, Zero);
1748       } break;
1749       case InstArithmetic::Ashr: {
1750         // a=b>>c (signed) ==>
1751         //   t3 = b.hi
1752         //   t3 = sar t3, 0x1f
1753         //   t2 = b.hi
1754         //   t2 = shrd t2, t3, ShiftAmount-32
1755         //   a.lo = t2
1756         //   a.hi = t3
1757         _mov(T_3, Src0Hi);
1758         _sar(T_3, SignExtend);
1759         _mov(T_2, Src0Hi);
1760         _shrd(T_2, T_3, ReducedShift);
1761         _mov(DestLo, T_2);
1762         _mov(DestHi, T_3);
1763       } break;
1764       }
1765     } else if (ShiftAmount == 32) {
1766       switch (Op) {
1767       default:
1768         assert(0 && "non-shift op");
1769         break;
1770       case InstArithmetic::Shl: {
1771         // a=b<<c ==>
1772         //   t2 = b.lo
1773         //   a.hi = t2
1774         //   a.lo = 0
1775         _mov(T_2, Src0Lo);
1776         _mov(DestHi, T_2);
1777         _mov(DestLo, Zero);
1778       } break;
1779       case InstArithmetic::Lshr: {
1780         // a=b>>c (unsigned) ==>
1781         //   t2 = b.hi
1782         //   a.lo = t2
1783         //   a.hi = 0
1784         _mov(T_2, Src0Hi);
1785         _mov(DestLo, T_2);
1786         _mov(DestHi, Zero);
1787       } break;
1788       case InstArithmetic::Ashr: {
1789         // a=b>>c (signed) ==>
1790         //   t2 = b.hi
1791         //   a.lo = t2
1792         //   t3 = b.hi
1793         //   t3 = sar t3, 0x1f
1794         //   a.hi = t3
1795         _mov(T_2, Src0Hi);
1796         _mov(DestLo, T_2);
1797         _mov(T_3, Src0Hi);
1798         _sar(T_3, SignExtend);
1799         _mov(DestHi, T_3);
1800       } break;
1801       }
1802     } else {
1803       // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1804       //   t2 = b.lo
1805       //   t3 = b.hi
1806       _mov(T_2, Src0Lo);
1807       _mov(T_3, Src0Hi);
1808       switch (Op) {
1809       default:
1810         assert(0 && "non-shift op");
1811         break;
1812       case InstArithmetic::Shl: {
1813         // a=b<<c ==>
1814         //   t3 = shld t3, t2, ShiftAmount
1815         //   t2 = shl t2, ShiftAmount
1816         _shld(T_3, T_2, ConstantShiftAmount);
1817         _shl(T_2, ConstantShiftAmount);
1818       } break;
1819       case InstArithmetic::Lshr: {
1820         // a=b>>c (unsigned) ==>
1821         //   t2 = shrd t2, t3, ShiftAmount
1822         //   t3 = shr t3, ShiftAmount
1823         _shrd(T_2, T_3, ConstantShiftAmount);
1824         _shr(T_3, ConstantShiftAmount);
1825       } break;
1826       case InstArithmetic::Ashr: {
1827         // a=b>>c (signed) ==>
1828         //   t2 = shrd t2, t3, ShiftAmount
1829         //   t3 = sar t3, ShiftAmount
1830         _shrd(T_2, T_3, ConstantShiftAmount);
1831         _sar(T_3, ConstantShiftAmount);
1832       } break;
1833       }
1834       // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1835       //   a.lo = t2
1836       //   a.hi = t3
1837       _mov(DestLo, T_2);
1838       _mov(DestHi, T_3);
1839     }
1840   } else {
1841     // NON-CONSTANT CASES.
1842     Constant *BitTest = Ctx->getConstantInt32(0x20);
1843     InstX86Label *Label = InstX86Label::create(Func, this);
1844     // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1845     //   t1:ecx = c.lo & 0xff
1846     //   t2 = b.lo
1847     //   t3 = b.hi
1848     T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
1849     _mov(T_2, Src0Lo);
1850     _mov(T_3, Src0Hi);
1851     switch (Op) {
1852     default:
1853       assert(0 && "non-shift op");
1854       break;
1855     case InstArithmetic::Shl: {
1856       // a=b<<c ==>
1857       //   t3 = shld t3, t2, t1
1858       //   t2 = shl t2, t1
1859       //   test t1, 0x20
1860       //   je L1
1861       //   use(t3)
1862       //   t3 = t2
1863       //   t2 = 0
1864       _shld(T_3, T_2, T_1);
1865       _shl(T_2, T_1);
1866       _test(T_1, BitTest);
1867       _br(Traits::Cond::Br_e, Label);
1868       // T_2 and T_3 are being assigned again because of the intra-block control
1869       // flow, so we need to use _redefined to avoid liveness problems.
1870       _redefined(_mov(T_3, T_2));
1871       _redefined(_mov(T_2, Zero));
1872     } break;
1873     case InstArithmetic::Lshr: {
1874       // a=b>>c (unsigned) ==>
1875       //   t2 = shrd t2, t3, t1
1876       //   t3 = shr t3, t1
1877       //   test t1, 0x20
1878       //   je L1
1879       //   use(t2)
1880       //   t2 = t3
1881       //   t3 = 0
1882       _shrd(T_2, T_3, T_1);
1883       _shr(T_3, T_1);
1884       _test(T_1, BitTest);
1885       _br(Traits::Cond::Br_e, Label);
1886       // T_2 and T_3 are being assigned again because of the intra-block control
1887       // flow, so we need to use _redefined to avoid liveness problems.
1888       _redefined(_mov(T_2, T_3));
1889       _redefined(_mov(T_3, Zero));
1890     } break;
1891     case InstArithmetic::Ashr: {
1892       // a=b>>c (signed) ==>
1893       //   t2 = shrd t2, t3, t1
1894       //   t3 = sar t3, t1
1895       //   test t1, 0x20
1896       //   je L1
1897       //   use(t2)
1898       //   t2 = t3
1899       //   t3 = sar t3, 0x1f
1900       Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1901       _shrd(T_2, T_3, T_1);
1902       _sar(T_3, T_1);
1903       _test(T_1, BitTest);
1904       _br(Traits::Cond::Br_e, Label);
1905       // T_2 and T_3 are being assigned again because of the intra-block control
1906       // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
1907       // doesn't need special treatment because it is reassigned via _sar
1908       // instead of _mov.
1909       _redefined(_mov(T_2, T_3));
1910       _sar(T_3, SignExtend);
1911     } break;
1912     }
1913     // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1914     // L1:
1915     //   a.lo = t2
1916     //   a.hi = t3
1917     Context.insert(Label);
1918     _mov(DestLo, T_2);
1919     _mov(DestHi, T_3);
1920   }
1921 }
1922 
1923 template <typename TraitsType>
1924 void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Instr) {
1925   Variable *Dest = Instr->getDest();
1926   if (Dest->isRematerializable()) {
1927     Context.insert<InstFakeDef>(Dest);
1928     return;
1929   }
1930   Type Ty = Dest->getType();
1931   Operand *Src0 = legalize(Instr->getSrc(0));
1932   Operand *Src1 = legalize(Instr->getSrc(1));
1933   if (Instr->isCommutative()) {
1934     uint32_t SwapCount = 0;
1935     if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
1936       std::swap(Src0, Src1);
1937       ++SwapCount;
1938     }
1939     if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
1940       std::swap(Src0, Src1);
1941       ++SwapCount;
1942     }
1943     // Improve two-address code patterns by avoiding a copy to the dest
1944     // register when one of the source operands ends its lifetime here.
1945     if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
1946       std::swap(Src0, Src1);
1947       ++SwapCount;
1948     }
1949     assert(SwapCount <= 1);
1950     (void)SwapCount;
1951   }
1952   if (!Traits::Is64Bit && Ty == IceType_i64) {
1953     // These x86-32 helper-call-involved instructions are lowered in this
1954     // separate switch because loOperand() and hiOperand() may insert redundant
1955     // instructions for constant blinding and pooling, and such redundant
1956     // instructions fail liveness analysis under -Om1. Besides, these arguments
1957     // do not need to be processed through loOperand() and hiOperand() in
1958     // order to be used.
1959     switch (Instr->getOp()) {
1960     case InstArithmetic::Udiv:
1961     case InstArithmetic::Sdiv:
1962     case InstArithmetic::Urem:
1963     case InstArithmetic::Srem:
1964       llvm::report_fatal_error("Helper call was expected");
1965       return;
1966     default:
1967       break;
1968     }
1969 
1970     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
1971     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
1972     Operand *Src0Lo = loOperand(Src0);
1973     Operand *Src0Hi = hiOperand(Src0);
1974     Operand *Src1Lo = loOperand(Src1);
1975     Operand *Src1Hi = hiOperand(Src1);
1976     Variable *T_Lo = nullptr, *T_Hi = nullptr;
1977     switch (Instr->getOp()) {
1978     case InstArithmetic::_num:
1979       llvm_unreachable("Unknown arithmetic operator");
1980       break;
1981     case InstArithmetic::Add:
1982       _mov(T_Lo, Src0Lo);
1983       _add(T_Lo, Src1Lo);
1984       _mov(DestLo, T_Lo);
1985       _mov(T_Hi, Src0Hi);
1986       _adc(T_Hi, Src1Hi);
1987       _mov(DestHi, T_Hi);
1988       break;
1989     case InstArithmetic::And:
1990       _mov(T_Lo, Src0Lo);
1991       _and(T_Lo, Src1Lo);
1992       _mov(DestLo, T_Lo);
1993       _mov(T_Hi, Src0Hi);
1994       _and(T_Hi, Src1Hi);
1995       _mov(DestHi, T_Hi);
1996       break;
1997     case InstArithmetic::Or:
1998       _mov(T_Lo, Src0Lo);
1999       _or(T_Lo, Src1Lo);
2000       _mov(DestLo, T_Lo);
2001       _mov(T_Hi, Src0Hi);
2002       _or(T_Hi, Src1Hi);
2003       _mov(DestHi, T_Hi);
2004       break;
2005     case InstArithmetic::Xor:
2006       _mov(T_Lo, Src0Lo);
2007       _xor(T_Lo, Src1Lo);
2008       _mov(DestLo, T_Lo);
2009       _mov(T_Hi, Src0Hi);
2010       _xor(T_Hi, Src1Hi);
2011       _mov(DestHi, T_Hi);
2012       break;
2013     case InstArithmetic::Sub:
2014       _mov(T_Lo, Src0Lo);
2015       _sub(T_Lo, Src1Lo);
2016       _mov(DestLo, T_Lo);
2017       _mov(T_Hi, Src0Hi);
2018       _sbb(T_Hi, Src1Hi);
2019       _mov(DestHi, T_Hi);
2020       break;
2021     case InstArithmetic::Mul: {
2022       Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
2023       Variable *T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
2024       Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
2025       // gcc does the following:
2026       // a=b*c ==>
2027       //   t1 = b.hi; t1 *=(imul) c.lo
2028       //   t2 = c.hi; t2 *=(imul) b.lo
2029       //   t3:eax = b.lo
2030       //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
2031       //   a.lo = t4.lo
2032       //   t4.hi += t1
2033       //   t4.hi += t2
2034       //   a.hi = t4.hi
2035       // The mul instruction cannot take an immediate operand.
2036       Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
2037       _mov(T_1, Src0Hi);
2038       _imul(T_1, Src1Lo);
2039       _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
2040       _mul(T_4Lo, T_3, Src1Lo);
2041       // The mul instruction produces two dest variables, edx:eax. We create a
2042       // fake definition of edx to account for this.
2043       Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
2044       Context.insert<InstFakeUse>(T_4Hi);
2045       _mov(DestLo, T_4Lo);
2046       _add(T_4Hi, T_1);
2047       _mov(T_2, Src1Hi);
2048       _imul(T_2, Src0Lo);
2049       _add(T_4Hi, T_2);
2050       _mov(DestHi, T_4Hi);
2051     } break;
2052     case InstArithmetic::Shl:
2053     case InstArithmetic::Lshr:
2054     case InstArithmetic::Ashr:
2055       lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
2056       break;
2057     case InstArithmetic::Fadd:
2058     case InstArithmetic::Fsub:
2059     case InstArithmetic::Fmul:
2060     case InstArithmetic::Fdiv:
2061     case InstArithmetic::Frem:
2062       llvm_unreachable("FP instruction with i64 type");
2063       break;
2064     case InstArithmetic::Udiv:
2065     case InstArithmetic::Sdiv:
2066     case InstArithmetic::Urem:
2067     case InstArithmetic::Srem:
2068       llvm_unreachable("Call-helper-involved instruction for i64 type \
2069                        should have already been handled before");
2070       break;
2071     }
2072     return;
2073   }
2074   if (isVectorType(Ty)) {
2075     // TODO: Trap on integer divide and integer modulo by zero. See:
2076     // https://code.google.com/p/nativeclient/issues/detail?id=3899
2077     if (llvm::isa<X86OperandMem>(Src1))
2078       Src1 = legalizeToReg(Src1);
2079     switch (Instr->getOp()) {
2080     case InstArithmetic::_num:
2081       llvm_unreachable("Unknown arithmetic operator");
2082       break;
2083     case InstArithmetic::Add: {
2084       Variable *T = makeReg(Ty);
2085       _movp(T, Src0);
2086       _padd(T, Src1);
2087       _movp(Dest, T);
2088     } break;
2089     case InstArithmetic::And: {
2090       Variable *T = makeReg(Ty);
2091       _movp(T, Src0);
2092       _pand(T, Src1);
2093       _movp(Dest, T);
2094     } break;
2095     case InstArithmetic::Or: {
2096       Variable *T = makeReg(Ty);
2097       _movp(T, Src0);
2098       _por(T, Src1);
2099       _movp(Dest, T);
2100     } break;
2101     case InstArithmetic::Xor: {
2102       Variable *T = makeReg(Ty);
2103       _movp(T, Src0);
2104       _pxor(T, Src1);
2105       _movp(Dest, T);
2106     } break;
2107     case InstArithmetic::Sub: {
2108       Variable *T = makeReg(Ty);
2109       _movp(T, Src0);
2110       _psub(T, Src1);
2111       _movp(Dest, T);
2112     } break;
2113     case InstArithmetic::Mul: {
2114       bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
2115       bool InstructionSetIsValidForPmull =
2116           Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
2117       if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
2118         Variable *T = makeReg(Ty);
2119         _movp(T, Src0);
2120         _pmull(T, Src0 == Src1 ? T : Src1);
2121         _movp(Dest, T);
2122       } else if (Ty == IceType_v4i32) {
2123         // Lowering sequence:
2124         // Note: The mask arguments have index 0 on the left.
2125         //
2126         // movups  T1, Src0
2127         // pshufd  T2, Src0, {1,0,3,0}
2128         // pshufd  T3, Src1, {1,0,3,0}
2129         // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
2130         // pmuludq T1, Src1
2131         // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
2132         // pmuludq T2, T3
2133         // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
2134         // shufps  T1, T2, {0,2,0,2}
2135         // pshufd  T4, T1, {0,2,1,3}
2136         // movups  Dest, T4
2137 
2138         // Mask that directs pshufd to create a vector with entries
2139         // Src[1, 0, 3, 0]
2140         constexpr unsigned Constant1030 = 0x31;
2141         Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
2142         // Mask that directs shufps to create a vector with entries
2143         // Dest[0, 2], Src[0, 2]
2144         constexpr unsigned Mask0202 = 0x88;
2145         // Mask that directs pshufd to create a vector with entries
2146         // Src[0, 2, 1, 3]
2147         constexpr unsigned Mask0213 = 0xd8;
2148         Variable *T1 = makeReg(IceType_v4i32);
2149         Variable *T2 = makeReg(IceType_v4i32);
2150         Variable *T3 = makeReg(IceType_v4i32);
2151         Variable *T4 = makeReg(IceType_v4i32);
2152         _movp(T1, Src0);
2153         _pshufd(T2, Src0, Mask1030);
2154         _pshufd(T3, Src1, Mask1030);
2155         _pmuludq(T1, Src1);
2156         _pmuludq(T2, T3);
2157         _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
2158         _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
2159         _movp(Dest, T4);
2160       } else if (Ty == IceType_v16i8) {
2161         llvm::report_fatal_error("Scalarized operation was expected");
2162       } else {
2163         llvm::report_fatal_error("Invalid vector multiply type");
2164       }
2165     } break;
2166     case InstArithmetic::Shl: {
2167       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
2168       Variable *T = makeReg(Ty);
2169       _movp(T, Src0);
2170       _psll(T, Src1);
2171       _movp(Dest, T);
2172     } break;
2173     case InstArithmetic::Lshr: {
2174       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
2175       Variable *T = makeReg(Ty);
2176       _movp(T, Src0);
2177       _psrl(T, Src1);
2178       _movp(Dest, T);
2179     } break;
2180     case InstArithmetic::Ashr: {
2181       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
2182       Variable *T = makeReg(Ty);
2183       _movp(T, Src0);
2184       _psra(T, Src1);
2185       _movp(Dest, T);
2186     } break;
2187     case InstArithmetic::Udiv:
2188     case InstArithmetic::Urem:
2189     case InstArithmetic::Sdiv:
2190     case InstArithmetic::Srem:
2191       llvm::report_fatal_error("Scalarized operation was expected");
2192       break;
2193     case InstArithmetic::Fadd: {
2194       Variable *T = makeReg(Ty);
2195       _movp(T, Src0);
2196       _addps(T, Src1);
2197       _movp(Dest, T);
2198     } break;
2199     case InstArithmetic::Fsub: {
2200       Variable *T = makeReg(Ty);
2201       _movp(T, Src0);
2202       _subps(T, Src1);
2203       _movp(Dest, T);
2204     } break;
2205     case InstArithmetic::Fmul: {
2206       Variable *T = makeReg(Ty);
2207       _movp(T, Src0);
2208       _mulps(T, Src0 == Src1 ? T : Src1);
2209       _movp(Dest, T);
2210     } break;
2211     case InstArithmetic::Fdiv: {
2212       Variable *T = makeReg(Ty);
2213       _movp(T, Src0);
2214       _divps(T, Src1);
2215       _movp(Dest, T);
2216     } break;
2217     case InstArithmetic::Frem:
2218       llvm::report_fatal_error("Scalarized operation was expected");
2219       break;
2220     }
2221     return;
2222   }
2223   Variable *T_edx = nullptr;
2224   Variable *T = nullptr;
2225   switch (Instr->getOp()) {
2226   case InstArithmetic::_num:
2227     llvm_unreachable("Unknown arithmetic operator");
2228     break;
2229   case InstArithmetic::Add: {
2230     const bool ValidType =
2231         Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit);
2232     auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
2233     const bool ValidKind =
2234         Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
2235                              llvm::isa<ConstantRelocatable>(Const));
2236     if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
2237       auto *Var = legalizeToReg(Src0);
2238       auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const);
2239       T = makeReg(Ty);
2240       _lea(T, _sandbox_mem_reference(Mem));
2241       _mov(Dest, T);
2242       break;
2243     }
2244     _mov(T, Src0);
2245     _add(T, Src1);
2246     _mov(Dest, T);
2247   } break;
2248   case InstArithmetic::And:
2249     _mov(T, Src0);
2250     _and(T, Src1);
2251     _mov(Dest, T);
2252     break;
2253   case InstArithmetic::Or:
2254     _mov(T, Src0);
2255     _or(T, Src1);
2256     _mov(Dest, T);
2257     break;
2258   case InstArithmetic::Xor:
2259     _mov(T, Src0);
2260     _xor(T, Src1);
2261     _mov(Dest, T);
2262     break;
2263   case InstArithmetic::Sub:
2264     _mov(T, Src0);
2265     _sub(T, Src1);
2266     _mov(Dest, T);
2267     break;
2268   case InstArithmetic::Mul:
2269     if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2270       if (optimizeScalarMul(Dest, Src0, C->getValue()))
2271         return;
2272     }
2273     // The 8-bit version of imul only allows the form "imul r/m8" where T must
2274     // be in al.
2275     if (isByteSizedArithType(Ty)) {
2276       _mov(T, Src0, Traits::RegisterSet::Reg_al);
2277       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2278       _imul(T, Src0 == Src1 ? T : Src1);
2279       _mov(Dest, T);
2280     } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2281       T = makeReg(Ty);
2282       _imul_imm(T, Src0, ImmConst);
2283       _mov(Dest, T);
2284     } else {
2285       _mov(T, Src0);
2286       _imul(T, Src0 == Src1 ? T : Src1);
2287       _mov(Dest, T);
2288     }
2289     break;
2290   case InstArithmetic::Shl:
2291     _mov(T, Src0);
2292     if (!llvm::isa<ConstantInteger32>(Src1) &&
2293         !llvm::isa<ConstantInteger64>(Src1))
2294       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2295     _shl(T, Src1);
2296     _mov(Dest, T);
2297     break;
2298   case InstArithmetic::Lshr:
2299     _mov(T, Src0);
2300     if (!llvm::isa<ConstantInteger32>(Src1) &&
2301         !llvm::isa<ConstantInteger64>(Src1))
2302       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2303     _shr(T, Src1);
2304     _mov(Dest, T);
2305     break;
2306   case InstArithmetic::Ashr:
2307     _mov(T, Src0);
2308     if (!llvm::isa<ConstantInteger32>(Src1) &&
2309         !llvm::isa<ConstantInteger64>(Src1))
2310       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2311     _sar(T, Src1);
2312     _mov(Dest, T);
2313     break;
2314   case InstArithmetic::Udiv: {
2315     // div and idiv are among the few arithmetic operators that do not allow
2316     // an immediate as an operand.
2317     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2318     RegNumT Eax;
2319     RegNumT Edx;
2320     switch (Ty) {
2321     default:
2322       llvm::report_fatal_error("Bad type for udiv");
2323     case IceType_i64:
2324       Eax = Traits::getRaxOrDie();
2325       Edx = Traits::getRdxOrDie();
2326       break;
2327     case IceType_i32:
2328       Eax = Traits::RegisterSet::Reg_eax;
2329       Edx = Traits::RegisterSet::Reg_edx;
2330       break;
2331     case IceType_i16:
2332       Eax = Traits::RegisterSet::Reg_ax;
2333       Edx = Traits::RegisterSet::Reg_dx;
2334       break;
2335     case IceType_i8:
2336       Eax = Traits::RegisterSet::Reg_al;
2337       Edx = Traits::RegisterSet::Reg_ah;
2338       break;
2339     }
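    // The hardware div computes Edx:Eax / Src1, leaving the quotient in Eax,
    // so Edx is zeroed below to make this a plain unsigned divide (for i8, ah
    // stands in for Edx).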
2340     T_edx = makeReg(Ty, Edx);
2341     _mov(T, Src0, Eax);
2342     _mov(T_edx, Ctx->getConstantZero(Ty));
2343     _div(T_edx, Src1, T);
2344     _redefined(Context.insert<InstFakeDef>(T, T_edx));
2345     _mov(Dest, T);
2346   } break;
2347   case InstArithmetic::Sdiv:
2348     // TODO(stichnot): Enable this after doing better performance and cross
2349     // testing.
2350     if (false && Func->getOptLevel() >= Opt_1) {
2351       // Optimize division by constant power of 2, but not for Om1 or O0, just
2352       // to keep things simple there.
2353       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2354         const int32_t Divisor = C->getValue();
2355         const uint32_t UDivisor = Divisor;
2356         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2357           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2358           // LLVM does the following for dest=src/(1<<log):
2359           //   t=src
2360           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2361           //   shr t,typewidth-log
2362           //   add t,src
2363           //   sar t,log
2364           //   dest=t
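          // Worked example (for illustration), src = -7, log = 2:
          //   sar 31 -> -1; shr 30 -> 3; add src -> -4; sar 2 -> -1, which
          //   matches trunc(-7/4). The shr extracts divisor-1 from the sign
          //   mask, biasing negative dividends so the final sar rounds toward
          //   zero.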
2365           uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
2366           _mov(T, Src0);
2367           // If for some reason we are dividing by 1, just treat it like an
2368           // assignment.
2369           if (LogDiv > 0) {
2370             // The initial sar is unnecessary when dividing by 2.
2371             if (LogDiv > 1)
2372               _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2373             _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2374             _add(T, Src0);
2375             _sar(T, Ctx->getConstantInt(Ty, LogDiv));
2376           }
2377           _mov(Dest, T);
2378           return;
2379         }
2380       }
2381     }
2382     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2383     switch (Ty) {
2384     default:
2385       llvm::report_fatal_error("Bad type for sdiv");
2386     case IceType_i64:
2387       T_edx = makeReg(Ty, Traits::getRdxOrDie());
2388       _mov(T, Src0, Traits::getRaxOrDie());
2389       break;
2390     case IceType_i32:
2391       T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
2392       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
2393       break;
2394     case IceType_i16:
2395       T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
2396       _mov(T, Src0, Traits::RegisterSet::Reg_ax);
2397       break;
2398     case IceType_i8:
2399       T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
2400       _mov(T, Src0, Traits::RegisterSet::Reg_al);
2401       break;
2402     }
2403     _cbwdq(T_edx, T);
2404     _idiv(T_edx, Src1, T);
2405     _redefined(Context.insert<InstFakeDef>(T, T_edx));
2406     _mov(Dest, T);
2407     break;
2408   case InstArithmetic::Urem: {
2409     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2410     RegNumT Eax;
2411     RegNumT Edx;
2412     switch (Ty) {
2413     default:
2414       llvm::report_fatal_error("Bad type for urem");
2415     case IceType_i64:
2416       Eax = Traits::getRaxOrDie();
2417       Edx = Traits::getRdxOrDie();
2418       break;
2419     case IceType_i32:
2420       Eax = Traits::RegisterSet::Reg_eax;
2421       Edx = Traits::RegisterSet::Reg_edx;
2422       break;
2423     case IceType_i16:
2424       Eax = Traits::RegisterSet::Reg_ax;
2425       Edx = Traits::RegisterSet::Reg_dx;
2426       break;
2427     case IceType_i8:
2428       Eax = Traits::RegisterSet::Reg_al;
2429       Edx = Traits::RegisterSet::Reg_ah;
2430       break;
2431     }
2432     T_edx = makeReg(Ty, Edx);
2433     _mov(T_edx, Ctx->getConstantZero(Ty));
2434     _mov(T, Src0, Eax);
2435     _div(T, Src1, T_edx);
2436     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2437     if (Ty == IceType_i8) {
2438       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2439       // moved into a general 8-bit register.
2440       auto *T_AhRcvr = makeReg(Ty);
2441       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2442       _mov(T_AhRcvr, T_edx);
2443       T_edx = T_AhRcvr;
2444     }
2445     _mov(Dest, T_edx);
2446   } break;
2447   case InstArithmetic::Srem: {
2448     // TODO(stichnot): Enable this after doing better performance and cross
2449     // testing.
2450     if (false && Func->getOptLevel() >= Opt_1) {
2451       // Optimize mod by constant power of 2, but not for Om1 or O0, just to
2452       // keep things simple there.
2453       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2454         const int32_t Divisor = C->getValue();
2455         const uint32_t UDivisor = Divisor;
2456         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2457           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2458           // LLVM does the following for dest=src%(1<<log):
2459           //   t=src
2460           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2461           //   shr t,typewidth-log
2462           //   add t,src
2463           //   and t, -(1<<log)
2464           //   sub t,src
2465           //   neg t
2466           //   dest=t
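          // Worked example (for illustration), src = -7, log = 2: the "and"
          // leaves t = -4, i.e. src rounded toward zero to a multiple of 4;
          // then sub src gives -4 - (-7) = 3, and neg yields -3 == -7 % 4.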
2467           uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
2468           // If for some reason we are dividing by 1, just assign 0.
2469           if (LogDiv == 0) {
2470             _mov(Dest, Ctx->getConstantZero(Ty));
2471             return;
2472           }
2473           _mov(T, Src0);
2474           // The initial sar is unnecessary when dividing by 2.
2475           if (LogDiv > 1)
2476             _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2477           _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2478           _add(T, Src0);
2479           _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
2480           _sub(T, Src0);
2481           _neg(T);
2482           _mov(Dest, T);
2483           return;
2484         }
2485       }
2486     }
2487     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2488     RegNumT Eax;
2489     RegNumT Edx;
2490     switch (Ty) {
2491     default:
2492       llvm::report_fatal_error("Bad type for srem");
2493     case IceType_i64:
2494       Eax = Traits::getRaxOrDie();
2495       Edx = Traits::getRdxOrDie();
2496       break;
2497     case IceType_i32:
2498       Eax = Traits::RegisterSet::Reg_eax;
2499       Edx = Traits::RegisterSet::Reg_edx;
2500       break;
2501     case IceType_i16:
2502       Eax = Traits::RegisterSet::Reg_ax;
2503       Edx = Traits::RegisterSet::Reg_dx;
2504       break;
2505     case IceType_i8:
2506       Eax = Traits::RegisterSet::Reg_al;
2507       Edx = Traits::RegisterSet::Reg_ah;
2508       break;
2509     }
2510     T_edx = makeReg(Ty, Edx);
2511     _mov(T, Src0, Eax);
2512     _cbwdq(T_edx, T);
2513     _idiv(T, Src1, T_edx);
2514     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2515     if (Ty == IceType_i8) {
2516       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2517       // moved into a general 8-bit register.
2518       auto *T_AhRcvr = makeReg(Ty);
2519       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2520       _mov(T_AhRcvr, T_edx);
2521       T_edx = T_AhRcvr;
2522     }
2523     _mov(Dest, T_edx);
2524   } break;
2525   case InstArithmetic::Fadd:
2526     _mov(T, Src0);
2527     _addss(T, Src1);
2528     _mov(Dest, T);
2529     break;
2530   case InstArithmetic::Fsub:
2531     _mov(T, Src0);
2532     _subss(T, Src1);
2533     _mov(Dest, T);
2534     break;
2535   case InstArithmetic::Fmul:
2536     _mov(T, Src0);
2537     _mulss(T, Src0 == Src1 ? T : Src1);
2538     _mov(Dest, T);
2539     break;
2540   case InstArithmetic::Fdiv:
2541     _mov(T, Src0);
2542     _divss(T, Src1);
2543     _mov(Dest, T);
2544     break;
2545   case InstArithmetic::Frem:
2546     llvm::report_fatal_error("Helper call was expected");
2547     break;
2548   }
2549 }
2550 
2551 template <typename TraitsType>
2552 void TargetX86Base<TraitsType>::lowerAssign(const InstAssign *Instr) {
2553   Variable *Dest = Instr->getDest();
2554   if (Dest->isRematerializable()) {
2555     Context.insert<InstFakeDef>(Dest);
2556     return;
2557   }
2558   Operand *Src = Instr->getSrc(0);
2559   assert(Dest->getType() == Src->getType());
2560   lowerMove(Dest, Src, false);
2561 }
2562 
2563 template <typename TraitsType>
2564 void TargetX86Base<TraitsType>::lowerBr(const InstBr *Br) {
2565   if (Br->isUnconditional()) {
2566     _br(Br->getTargetUnconditional());
2567     return;
2568   }
2569   Operand *Cond = Br->getCondition();
2570 
2571   // Handle folding opportunities.
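  // E.g., when the condition comes from an icmp that was folded into this
  // branch, the compare and conditional jump are emitted together and no i1
  // value is ever materialized.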
2572   if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
2573     assert(Producer->isDeleted());
2574     switch (BoolFolding<Traits>::getProducerKind(Producer)) {
2575     default:
2576       break;
2577     case BoolFolding<Traits>::PK_Icmp32:
2578     case BoolFolding<Traits>::PK_Icmp64: {
2579       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
2580       return;
2581     }
2582     case BoolFolding<Traits>::PK_Fcmp: {
2583       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
2584       return;
2585     }
2586     case BoolFolding<Traits>::PK_Arith: {
2587       lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
2588       return;
2589     }
2590     }
2591   }
2592   Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
2593   Constant *Zero = Ctx->getConstantZero(IceType_i32);
2594   _cmp(Src0, Zero);
2595   _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
2596 }
2597 
2598 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
2599 // OperandList in lowerCall. std::max() is not constexpr until C++14.
2600 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
2601   return S0 < S1 ? S1 : S0;
2602 }
2603 
2604 template <typename TraitsType>
2605 void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
2606   // Common x86 calling convention lowering:
2607   //
2608   // * At the point before the call, the stack must be aligned to 16 bytes.
2609   //
2610   // * Non-register arguments are pushed onto the stack in right-to-left order,
2611   // such that the left-most argument ends up on the top of the stack at the
2612   // lowest memory address.
2613   //
2614   // * Stack arguments of vector type are aligned to start at the next highest
2615   // multiple of 16 bytes. Other stack arguments are aligned to the next word
2616   // size boundary (4 or 8 bytes, respectively).
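  // For illustration on x86-32: stack arguments (i32, v4f32) would land at
  // esp+0 and esp+16 (the vector is aligned up from offset 4), giving a
  // 32-byte parameter area after the final 16-byte rounding.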
2617   RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
2618                                             Traits::X86_STACK_ALIGNMENT_BYTES);
2619 
2620   using OperandList =
2621       llvm::SmallVector<Operand *, constexprMax(Traits::X86_MAX_XMM_ARGS,
2622                                                 Traits::X86_MAX_GPR_ARGS)>;
2623   OperandList XmmArgs;
2624   CfgVector<std::pair<const Type, Operand *>> GprArgs;
2625   OperandList StackArgs, StackArgLocations;
2626   uint32_t ParameterAreaSizeBytes = 0;
2627 
2628   // Classify each argument operand according to the location where the argument
2629   // is passed.
2630   for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
2631     Operand *Arg = Instr->getArg(i);
2632     const Type Ty = Arg->getType();
2633     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
2634     assert(typeWidthInBytes(Ty) >= 4);
2635     if (isVectorType(Ty) &&
2636         Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
2637       XmmArgs.push_back(Arg);
2638     } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
2639                Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
2640       XmmArgs.push_back(Arg);
2641     } else if (isScalarIntegerType(Ty) &&
2642                Traits::getRegisterForGprArgNum(Ty, GprArgs.size()).hasValue()) {
2643       GprArgs.emplace_back(Ty, Arg);
2644     } else {
2645       // Place on stack.
2646       StackArgs.push_back(Arg);
2647       if (isVectorType(Arg->getType())) {
2648         ParameterAreaSizeBytes =
2649             Traits::applyStackAlignment(ParameterAreaSizeBytes);
2650       }
2651       Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
2652       Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
2653       StackArgLocations.push_back(
2654           Traits::X86OperandMem::create(Func, Ty, esp, Loc));
2655       ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
2656     }
2657   }
2658   // Ensure there is enough space for the fstp/movs for floating returns.
2659   Variable *Dest = Instr->getDest();
2660   const Type DestTy = Dest ? Dest->getType() : IceType_void;
2661   if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2662     if (isScalarFloatingType(DestTy)) {
2663       ParameterAreaSizeBytes =
2664           std::max(static_cast<size_t>(ParameterAreaSizeBytes),
2665                    typeWidthInBytesOnStack(DestTy));
2666     }
2667   }
2668   // Adjust the parameter area so that the stack is aligned. It is assumed that
2669   // the stack is already aligned at the start of the calling sequence.
2670   ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
2671   assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
2672   // Copy arguments that are passed on the stack to the appropriate stack
2673   // locations.  We make sure legalize() is called on each argument at this
2674   // point, to allow availabilityGet() to work.
2675   for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
2676     lowerStore(
2677         InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
2678   }
2679   // Copy arguments to be passed in registers to the appropriate registers.
2680   for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
2681     XmmArgs[i] =
2682         legalizeToReg(legalize(XmmArgs[i]), Traits::getRegisterForXmmArgNum(i));
2683   }
2684   // Materialize moves for arguments passed in GPRs.
2685   for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
2686     const Type SignatureTy = GprArgs[i].first;
2687     Operand *Arg =
2688         legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
2689     GprArgs[i].second =
2690         legalizeToReg(Arg, Traits::getRegisterForGprArgNum(Arg->getType(), i));
2691     assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
2692     assert(SignatureTy == Arg->getType());
2693     (void)SignatureTy;
2694   }
2695   // Generate a FakeUse of register arguments so that they do not get dead code
2696   // eliminated as a result of the FakeKill of scratch registers after the call.
2697   // These need to be right before the call instruction.
2698   for (auto *Arg : XmmArgs) {
2699     Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
2700   }
2701   for (auto &ArgPair : GprArgs) {
2702     Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
2703   }
2704   // Generate the call instruction. Assign its result to a temporary with high
2705   // register allocation weight.
2706   // ReturnReg doubles as ReturnRegLo as necessary.
2707   Variable *ReturnReg = nullptr;
2708   Variable *ReturnRegHi = nullptr;
2709   if (Dest) {
2710     switch (DestTy) {
2711     case IceType_NUM:
2712     case IceType_void:
2713     case IceType_i1:
2714     case IceType_i8:
2715     case IceType_i16:
2716       llvm::report_fatal_error("Invalid Call dest type");
2717       break;
2718     case IceType_i32:
2719       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax);
2720       break;
2721     case IceType_i64:
2722       if (Traits::Is64Bit) {
2723         ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie());
2724       } else {
2725         ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
2726         ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
2727       }
2728       break;
2729     case IceType_f32:
2730     case IceType_f64:
2731       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2732         // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
2733         // the fstp instruction.
2734         break;
2735       }
2736     // Fallthrough intended.
2737     case IceType_v4i1:
2738     case IceType_v8i1:
2739     case IceType_v16i1:
2740     case IceType_v16i8:
2741     case IceType_v8i16:
2742     case IceType_v4i32:
2743     case IceType_v4f32:
2744       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0);
2745       break;
2746     }
2747   }
2748   // Emit the call to the function.
2749   Operand *CallTarget =
2750       legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
2751   Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg);
2752   // Keep the upper return register live on 32-bit platform.
2753   if (ReturnRegHi)
2754     Context.insert<InstFakeDef>(ReturnRegHi);
2755   // Mark the call as killing all the caller-save registers.
2756   Context.insert<InstFakeKill>(NewCall);
2757   // Handle x86-32 floating point returns.
2758   if (Dest != nullptr && isScalarFloatingType(DestTy) &&
2759       !Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2760     // Special treatment for an FP function which returns its result in st(0).
2761     // If Dest ends up being a physical xmm register, the fstp emit code will
2762     // route st(0) through the space reserved in the function argument area
2763     // we allocated.
2764     _fstp(Dest);
2765     // Create a fake use of Dest in case it actually isn't used, because st(0)
2766     // still needs to be popped.
2767     Context.insert<InstFakeUse>(Dest);
2768   }
2769   // Generate a FakeUse to keep the call live if necessary.
2770   if (Instr->hasSideEffects() && ReturnReg) {
2771     Context.insert<InstFakeUse>(ReturnReg);
2772   }
2773   // Process the return value, if any.
2774   if (Dest == nullptr)
2775     return;
2776   // Assign the result of the call to Dest.  Route it through a temporary so
2777   // that the local register availability peephole can be subsequently used.
2778   Variable *Tmp = nullptr;
2779   if (isVectorType(DestTy)) {
2780     assert(ReturnReg && "Vector type requires a return register");
2781     Tmp = makeReg(DestTy);
2782     _movp(Tmp, ReturnReg);
2783     _movp(Dest, Tmp);
2784   } else if (isScalarFloatingType(DestTy)) {
2785     if (Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2786       assert(ReturnReg && "FP type requires a return register");
2787       _mov(Tmp, ReturnReg);
2788       _mov(Dest, Tmp);
2789     }
2790   } else {
2791     assert(isScalarIntegerType(DestTy));
2792     assert(ReturnReg && "Integer type requires a return register");
2793     if (DestTy == IceType_i64 && !Traits::Is64Bit) {
2794       assert(ReturnRegHi && "64-bit type requires two return registers");
2795       auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
2796       Variable *DestLo = Dest64On32->getLo();
2797       Variable *DestHi = Dest64On32->getHi();
2798       _mov(Tmp, ReturnReg);
2799       _mov(DestLo, Tmp);
2800       Variable *TmpHi = nullptr;
2801       _mov(TmpHi, ReturnRegHi);
2802       _mov(DestHi, TmpHi);
2803     } else {
2804       _mov(Tmp, ReturnReg);
2805       _mov(Dest, Tmp);
2806     }
2807   }
2808 }
2809 
2810 template <typename TraitsType>
2811 void TargetX86Base<TraitsType>::lowerCast(const InstCast *Instr) {
2812   // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
2813   InstCast::OpKind CastKind = Instr->getCastKind();
2814   Variable *Dest = Instr->getDest();
2815   Type DestTy = Dest->getType();
2816   switch (CastKind) {
2817   default:
2818     Func->setError("Cast type not supported");
2819     return;
2820   case InstCast::Sext: {
2821     // Src0RM is the source operand legalized to physical register or memory,
2822     // but not immediate, since the relevant x86 native instructions don't
2823     // allow an immediate operand. If the operand is an immediate, we could
2824     // consider computing the strength-reduced result at translation time, but
2825     // we're unlikely to see something like that in the bitcode that the
2826     // optimizer wouldn't have already taken care of.
2827     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2828     if (isVectorType(DestTy)) {
2829       if (DestTy == IceType_v16i8) {
2830         // onemask = materialize(1,1,...); dst = (src & onemask) > 0
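             // Illustrative SSE2 sequence for v16i8, assuming T already holds
             // a copy of the source:
             //   pand    T, OneMask   ; keep only bit 0 of each byte lane
             //   pcmpgtb T, Zeros     ; 0x01 -> 0xFF, 0x00 -> 0x00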
2831         Variable *OneMask = makeVectorOfOnes(DestTy);
2832         Variable *T = makeReg(DestTy);
2833         _movp(T, Src0RM);
2834         _pand(T, OneMask);
2835         Variable *Zeros = makeVectorOfZeros(DestTy);
2836         _pcmpgt(T, Zeros);
2837         _movp(Dest, T);
2838       } else {
2839         // width = width(elty) - 1; dest = (src << width) >> width
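             // Worked example for v8i16 lanes: ShiftAmount = 8 * 2 - 1 = 15,
             // so psllw/psraw by 15 replicate bit 0 across the lane, turning
             // 0x0001 into 0xFFFF and 0x0000 into 0x0000.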
2840         SizeT ShiftAmount =
2841             Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
2842             1;
2843         Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
2844         Variable *T = makeReg(DestTy);
2845         _movp(T, Src0RM);
2846         _psll(T, ShiftConstant);
2847         _psra(T, ShiftConstant);
2848         _movp(Dest, T);
2849       }
2850     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
2851       // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
2852       Constant *Shift = Ctx->getConstantInt32(31);
2853       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2854       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2855       Variable *T_Lo = makeReg(DestLo->getType());
2856       if (Src0RM->getType() == IceType_i32) {
2857         _mov(T_Lo, Src0RM);
2858       } else if (Src0RM->getType() == IceType_i1) {
2859         _movzx(T_Lo, Src0RM);
2860         _shl(T_Lo, Shift);
2861         _sar(T_Lo, Shift);
2862       } else {
2863         _movsx(T_Lo, Src0RM);
2864       }
2865       _mov(DestLo, T_Lo);
2866       Variable *T_Hi = nullptr;
2867       _mov(T_Hi, T_Lo);
2868       if (Src0RM->getType() != IceType_i1)
2869         // For i1, the sar instruction was already emitted above.
2870         _sar(T_Hi, Shift);
2871       _mov(DestHi, T_Hi);
2872     } else if (Src0RM->getType() == IceType_i1) {
2873       // t1 = src
2874       // shl t1, dst_bitwidth - 1
2875       // sar t1, dst_bitwidth - 1
2876       // dst = t1
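           // Worked example for DestTy == i32: DestBits - 1 == 31, so shl/sar
           // by 31 map an i1 value of 1 to 0xFFFFFFFF and 0 to 0x00000000.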
2877       size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
2878       Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
2879       Variable *T = makeReg(DestTy);
2880       if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
2881         _mov(T, Src0RM);
2882       } else {
2883         // Widen the source using movsx or movzx. (It doesn't matter which one,
2884         // since the following shl/sar overwrite the bits.)
2885         _movzx(T, Src0RM);
2886       }
2887       _shl(T, ShiftAmount);
2888       _sar(T, ShiftAmount);
2889       _mov(Dest, T);
2890     } else {
2891       // t1 = movsx src; dst = t1
2892       Variable *T = makeReg(DestTy);
2893       _movsx(T, Src0RM);
2894       _mov(Dest, T);
2895     }
2896     break;
2897   }
2898   case InstCast::Zext: {
2899     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2900     if (isVectorType(DestTy)) {
2901       // onemask = materialize(1,1,...); dest = onemask & src
2902       Variable *OneMask = makeVectorOfOnes(DestTy);
2903       Variable *T = makeReg(DestTy);
2904       _movp(T, Src0RM);
2905       _pand(T, OneMask);
2906       _movp(Dest, T);
2907     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
2908       // t1=movzx src; dst.lo=t1; dst.hi=0
2909       Constant *Zero = Ctx->getConstantZero(IceType_i32);
2910       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2911       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2912       Variable *Tmp = makeReg(DestLo->getType());
2913       if (Src0RM->getType() == IceType_i32) {
2914         _mov(Tmp, Src0RM);
2915       } else {
2916         _movzx(Tmp, Src0RM);
2917       }
2918       _mov(DestLo, Tmp);
2919       _mov(DestHi, Zero);
2920     } else if (Src0RM->getType() == IceType_i1) {
2921       // t = Src0RM; Dest = t
2922       Variable *T = nullptr;
2923       if (DestTy == IceType_i8) {
2924         _mov(T, Src0RM);
2925       } else {
2926         assert(DestTy != IceType_i1);
2927         assert(Traits::Is64Bit || DestTy != IceType_i64);
2928         // Use a 32-bit register for both i16 and i32 destinations, since
2929         // 32-bit ops are shorter. On x86-64, widen T to 64 bits so that T,
2930         // if written to the stack (i.e., in -Om1), is fully zero-extended.
2931         T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
2932         _movzx(T, Src0RM);
2933       }
2934       _mov(Dest, T);
2935     } else {
2936       // t1 = movzx src; dst = t1
2937       Variable *T = makeReg(DestTy);
2938       _movzx(T, Src0RM);
2939       _mov(Dest, T);
2940     }
2941     break;
2942   }
2943   case InstCast::Trunc: {
2944     if (isVectorType(DestTy)) {
2945       // onemask = materialize(1,1,...); dst = src & onemask
2946       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2947       Type Src0Ty = Src0RM->getType();
2948       Variable *OneMask = makeVectorOfOnes(Src0Ty);
2949       Variable *T = makeReg(DestTy);
2950       _movp(T, Src0RM);
2951       _pand(T, OneMask);
2952       _movp(Dest, T);
2953     } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
2954       // Make sure we truncate from and into valid registers.
2955       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2956       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
2957         Src0 = loOperand(Src0);
2958       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2959       Variable *T = copyToReg8(Src0RM);
2960       if (DestTy == IceType_i1)
2961         _and(T, Ctx->getConstantInt1(1));
2962       _mov(Dest, T);
2963     } else {
2964       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2965       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
2966         Src0 = loOperand(Src0);
2967       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2968       // t1 = trunc Src0RM; Dest = t1
2969       Variable *T = makeReg(DestTy);
2970       _mov(T, Src0RM);
2971       _mov(Dest, T);
2972     }
2973     break;
2974   }
2975   case InstCast::Fptrunc:
2976   case InstCast::Fpext: {
2977     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2978     // t1 = cvt Src0RM; Dest = t1
2979     Variable *T = makeReg(DestTy);
2980     _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
2981     _mov(Dest, T);
2982     break;
2983   }
2984   case InstCast::Fptosi:
2985     if (isVectorType(DestTy)) {
2986       assert(DestTy == IceType_v4i32);
2987       assert(Instr->getSrc(0)->getType() == IceType_v4f32);
2988       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
2989       Variable *T = makeReg(DestTy);
2990       _cvt(T, Src0R, Traits::Insts::Cvt::Tps2dq);
2991       _movp(Dest, T);
2992     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
2993       llvm::report_fatal_error("Helper call was expected");
2994     } else {
2995       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2996       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
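           // Note: the Tss2si variant is the truncating form (cvttss2si /
           // cvttsd2si), which rounds toward zero as fptosi requires.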
2997       Variable *T_1 = nullptr;
2998       if (Traits::Is64Bit && DestTy == IceType_i64) {
2999         T_1 = makeReg(IceType_i64);
3000       } else {
3001         assert(DestTy != IceType_i64);
3002         T_1 = makeReg(IceType_i32);
3003       }
3004       // cvt() requires its integer argument to be a GPR.
3005       Variable *T_2 = makeReg(DestTy);
3006       if (isByteSizedType(DestTy)) {
3007         assert(T_1->getType() == IceType_i32);
3008         T_1->setRegClass(RCX86_Is32To8);
3009         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
3010       }
3011       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
3012       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
3013       if (DestTy == IceType_i1)
3014         _and(T_2, Ctx->getConstantInt1(1));
3015       _mov(Dest, T_2);
3016     }
3017     break;
3018   case InstCast::Fptoui:
3019     if (isVectorType(DestTy)) {
3020       llvm::report_fatal_error("Helper call was expected");
3021     } else if (DestTy == IceType_i64 ||
3022                (!Traits::Is64Bit && DestTy == IceType_i32)) {
3023       llvm::report_fatal_error("Helper call was expected");
3024     } else {
3025       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
3026       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
3027       assert(DestTy != IceType_i64);
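           // There is no unsigned scalar cvt instruction, but every DestTy
           // reaching here (i1/i8/i16, or i32 under x86-64) fits in the
           // non-negative range of the wider signed register T_1, so a signed
           // truncating convert followed by a narrowing mov suffices.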
3028       Variable *T_1 = nullptr;
3029       if (Traits::Is64Bit && DestTy == IceType_i32) {
3030         T_1 = makeReg(IceType_i64);
3031       } else {
3032         assert(DestTy != IceType_i32);
3033         T_1 = makeReg(IceType_i32);
3034       }
3035       Variable *T_2 = makeReg(DestTy);
3036       if (isByteSizedType(DestTy)) {
3037         assert(T_1->getType() == IceType_i32);
3038         T_1->setRegClass(RCX86_Is32To8);
3039         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
3040       }
3041       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
3042       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
3043       if (DestTy == IceType_i1)
3044         _and(T_2, Ctx->getConstantInt1(1));
3045       _mov(Dest, T_2);
3046     }
3047     break;
3048   case InstCast::Sitofp:
3049     if (isVectorType(DestTy)) {
3050       assert(DestTy == IceType_v4f32);
3051       assert(Instr->getSrc(0)->getType() == IceType_v4i32);
3052       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
3053       Variable *T = makeReg(DestTy);
3054       _cvt(T, Src0R, Traits::Insts::Cvt::Dq2ps);
3055       _movp(Dest, T);
3056     } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) {
3057       llvm::report_fatal_error("Helper call was expected");
3058     } else {
3059       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
3060       // Sign-extend the operand.
3061       // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
3062       Variable *T_1 = nullptr;
3063       if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
3064         T_1 = makeReg(IceType_i64);
3065       } else {
3066         assert(Src0RM->getType() != IceType_i64);
3067         T_1 = makeReg(IceType_i32);
3068       }
3069       Variable *T_2 = makeReg(DestTy);
3070       if (Src0RM->getType() == T_1->getType())
3071         _mov(T_1, Src0RM);
3072       else
3073         _movsx(T_1, Src0RM);
3074       _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
3075       _mov(Dest, T_2);
3076     }
3077     break;
3078   case InstCast::Uitofp: {
3079     Operand *Src0 = Instr->getSrc(0);
3080     if (isVectorType(Src0->getType())) {
3081       llvm::report_fatal_error("Helper call was expected");
3082     } else if (Src0->getType() == IceType_i64 ||
3083                (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
3084       llvm::report_fatal_error("Helper call was expected");
3085     } else {
3086       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3087       // Zero-extend the operand.
3088       // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
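           // Zero-extending first is what makes the signed convert below
           // correct: after movzx the value is non-negative in the wider
           // register, so its signed and unsigned interpretations agree.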
3089       Variable *T_1 = nullptr;
3090       if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
3091         T_1 = makeReg(IceType_i64);
3092       } else {
3093         assert(Src0RM->getType() != IceType_i64);
3094         assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
3095         T_1 = makeReg(IceType_i32);
3096       }
3097       Variable *T_2 = makeReg(DestTy);
3098       if (Src0RM->getType() == T_1->getType())
3099         _mov(T_1, Src0RM);
3100       else
3101         _movzx(T_1, Src0RM)->setMustKeep();
3102       _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
3103       _mov(Dest, T_2);
3104     }
3105     break;
3106   }
3107   case InstCast::Bitcast: {
3108     Operand *Src0 = Instr->getSrc(0);
3109     if (DestTy == Src0->getType()) {
3110       auto *Assign = InstAssign::create(Func, Dest, Src0);
3111       lowerAssign(Assign);
3112       return;
3113     }
3114     switch (DestTy) {
3115     default:
3116       llvm_unreachable("Unexpected Bitcast dest type");
3117     case IceType_i8: {
3118       llvm::report_fatal_error("Helper call was expected");
3119     } break;
3120     case IceType_i16: {
3121       llvm::report_fatal_error("Helper call was expected");
3122     } break;
3123     case IceType_i32:
3124     case IceType_f32: {
3125       Variable *Src0R = legalizeToReg(Src0);
3126       Variable *T = makeReg(DestTy);
3127       _movd(T, Src0R);
3128       _mov(Dest, T);
3129     } break;
3130     case IceType_i64: {
3131       assert(Src0->getType() == IceType_f64);
3132       if (Traits::Is64Bit) {
3133         Variable *Src0R = legalizeToReg(Src0);
3134         Variable *T = makeReg(IceType_i64);
3135         _movd(T, Src0R);
3136         _mov(Dest, T);
3137       } else {
3138         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3139         // a.i64 = bitcast b.f64 ==>
3140         //   s.f64 = spill b.f64
3141         //   t_lo.i32 = lo(s.f64)
3142         //   a_lo.i32 = t_lo.i32
3143         //   t_hi.i32 = hi(s.f64)
3144         //   a_hi.i32 = t_hi.i32
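             // On x86-32 there is no direct move from an xmm register into a
             // pair of GPRs, so the f64 value is staged through a 64-bit
             // stack slot and its two 32-bit halves are loaded separately.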
3145         Operand *SpillLo, *SpillHi;
3146         if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
3147           Variable *Spill = Func->makeVariable(IceType_f64);
3148           Spill->setLinkedTo(Src0Var);
3149           Spill->setMustNotHaveReg();
3150           _movq(Spill, Src0RM);
3151           SpillLo = Traits::VariableSplit::create(Func, Spill,
3152                                                   Traits::VariableSplit::Low);
3153           SpillHi = Traits::VariableSplit::create(Func, Spill,
3154                                                   Traits::VariableSplit::High);
3155         } else {
3156           SpillLo = loOperand(Src0RM);
3157           SpillHi = hiOperand(Src0RM);
3158         }
3159 
3160         auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
3161         auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
3162         Variable *T_Lo = makeReg(IceType_i32);
3163         Variable *T_Hi = makeReg(IceType_i32);
3164 
3165         _mov(T_Lo, SpillLo);
3166         _mov(DestLo, T_Lo);
3167         _mov(T_Hi, SpillHi);
3168         _mov(DestHi, T_Hi);
3169       }
3170     } break;
3171     case IceType_f64: {
3172       assert(Src0->getType() == IceType_i64);
3173       if (Traits::Is64Bit) {
3174         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3175         Variable *T = makeReg(IceType_f64);
3176         _movd(T, Src0RM);
3177         _mov(Dest, T);
3178       } else {
3179         Src0 = legalize(Src0);
3180         if (llvm::isa<X86OperandMem>(Src0)) {
3181           Variable *T = makeReg(DestTy);
3182           _movq(T, Src0);
3183           _movq(Dest, T);
3184           break;
3185         }
3186         // a.f64 = bitcast b.i64 ==>
3187         //   t_lo.i32 = b_lo.i32
3188         //   FakeDef(s.f64)
3189         //   lo(s.f64) = t_lo.i32
3190         //   t_hi.i32 = b_hi.i32
3191         //   hi(s.f64) = t_hi.i32
3192         //   a.f64 = s.f64
3193         Variable *Spill = Func->makeVariable(IceType_f64);
3194         Spill->setLinkedTo(Dest);
3195         Spill->setMustNotHaveReg();
3196 
3197         Variable *T_Lo = nullptr, *T_Hi = nullptr;
3198         auto *SpillLo = Traits::VariableSplit::create(
3199             Func, Spill, Traits::VariableSplit::Low);
3200         auto *SpillHi = Traits::VariableSplit::create(
3201             Func, Spill, Traits::VariableSplit::High);
3202         _mov(T_Lo, loOperand(Src0));
3203         // Technically, the Spill is defined after the _store happens, but
3204         // SpillLo is considered a "use" of Spill, so define Spill before
3205         // it is used.
3206         Context.insert<InstFakeDef>(Spill);
3207         _store(T_Lo, SpillLo);
3208         _mov(T_Hi, hiOperand(Src0));
3209         _store(T_Hi, SpillHi);
3210         _movq(Dest, Spill);
3211       }
3212     } break;
3213     case IceType_v8i1: {
3214       llvm::report_fatal_error("Helper call was expected");
3215     } break;
3216     case IceType_v16i1: {
3217       llvm::report_fatal_error("Helper call was expected");
3218     } break;
3219     case IceType_v8i16:
3220     case IceType_v16i8:
3221     case IceType_v4i32:
3222     case IceType_v4f32: {
3223       if (Src0->getType() == IceType_i32) {
3224         // Bitcast normally requires equal type sizes, which doesn't hold
3225         // between a 32-bit scalar and a 128-bit vector; the mismatch is
3226         // allowed here because v4i8 vectors are emulated as v16i8 vectors.
3227         assert(getFlags().getApplicationBinaryInterface() != ABI_PNaCl &&
3228                "PNaCl only supports real 128-bit vectors");
3229         _movd(Dest, legalize(Src0, Legal_Reg | Legal_Mem));
3230       } else {
3231         _movp(Dest, legalizeToReg(Src0));
3232       }
3233     } break;
3234     }
3235     break;
3236   }
3237   }
3238 }
3239 
3240 template <typename TraitsType>
3241 void TargetX86Base<TraitsType>::lowerExtractElement(
3242     const InstExtractElement *Instr) {
3243   Operand *SourceVectNotLegalized = Instr->getSrc(0);
3244   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
3245   // Only constant indices are allowed in PNaCl IR.
3246   assert(ElementIndex);
3247 
3248   unsigned Index = ElementIndex->getValue();
3249   Type Ty = SourceVectNotLegalized->getType();
3250   Type ElementTy = typeElementType(Ty);
3251   Type InVectorElementTy = Traits::getInVectorElementType(Ty);
3252 
3253   // TODO(wala): Determine the best lowering sequences for each type.
3254   bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
3255                      (InstructionSet >= Traits::SSE4_1 && Ty != IceType_v4f32);
3256   Variable *ExtractedElementR =
3257       makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
3258   if (CanUsePextr) {
3259     // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
3260     // bits of the destination register, so we represent this by always
3261     // extracting into an i32 register.  The _mov into Dest below will do
3262     // truncation as necessary.
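         // Illustrative example: "pextrw eax, xmm1, 3" writes lane 3 into ax
         // and zeroes the upper bits of eax, so an i32 destination is safe.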
3263     Constant *Mask = Ctx->getConstantInt32(Index);
3264     Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
3265     _pextr(ExtractedElementR, SourceVectR, Mask);
3266   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3267     // Use pshufd and movd/movss.
3268     Variable *T = nullptr;
3269     if (Index) {
3270       // The shuffle only needs to occur if the element to be extracted is not
3271       // at the lowest index.
3272       Constant *Mask = Ctx->getConstantInt32(Index);
3273       T = makeReg(Ty);
3274       _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
3275     } else {
3276       T = legalizeToReg(SourceVectNotLegalized);
3277     }
3278 
3279     if (InVectorElementTy == IceType_i32) {
3280       _movd(ExtractedElementR, T);
3281     } else { // Ty == IceType_f32
3282       // TODO(wala): _movss is only used here because _mov does not allow a
3283       // vector source and a scalar destination.  _mov should be able to be
3284       // used here.
3285       // _movss is a binary instruction, so the FakeDef is needed to keep the
3286       // live range analysis consistent.
3287       Context.insert<InstFakeDef>(ExtractedElementR);
3288       _movss(ExtractedElementR, T);
3289     }
3290   } else {
3291     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3292     // Spill the value to a stack slot and do the extraction in memory.
3293     //
3294     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3295     // for legalizing to mem is implemented.
3296     Variable *Slot = Func->makeVariable(Ty);
3297     Slot->setMustNotHaveReg();
3298     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3299 
3300     // Compute the location of the element in memory.
3301     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3302     X86OperandMem *Loc =
3303         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3304     _mov(ExtractedElementR, Loc);
3305   }
3306 
3307   if (ElementTy == IceType_i1) {
3308     // Truncate extracted integers to i1s if necessary.
3309     Variable *T = makeReg(IceType_i1);
3310     InstCast *Cast =
3311         InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
3312     lowerCast(Cast);
3313     ExtractedElementR = T;
3314   }
3315 
3316   // Copy the element to the destination.
3317   Variable *Dest = Instr->getDest();
3318   _mov(Dest, ExtractedElementR);
3319 }
3320 
3321 template <typename TraitsType>
3322 void TargetX86Base<TraitsType>::lowerFcmp(const InstFcmp *Fcmp) {
3323   Variable *Dest = Fcmp->getDest();
3324 
3325   if (isVectorType(Dest->getType())) {
3326     lowerFcmpVector(Fcmp);
3327   } else {
3328     constexpr Inst *Consumer = nullptr;
3329     lowerFcmpAndConsumer(Fcmp, Consumer);
3330   }
3331 }
3332 
3333 template <typename TraitsType>
3334 void TargetX86Base<TraitsType>::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
3335                                                      const Inst *Consumer) {
3336   Operand *Src0 = Fcmp->getSrc(0);
3337   Operand *Src1 = Fcmp->getSrc(1);
3338   Variable *Dest = Fcmp->getDest();
3339 
3340   if (Consumer != nullptr) {
3341     if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3342       if (lowerOptimizeFcmpSelect(Fcmp, Select))
3343         return;
3344     }
3345   }
3346 
3347   if (isVectorType(Dest->getType())) {
3348     lowerFcmp(Fcmp);
3349     if (Consumer != nullptr)
3350       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
3351     return;
3352   }
3353 
3354   // Lowering a = fcmp cond, b, c
3355   //   ucomiss b, c       /* only if C1 != Br_None */
3356   //                      /* but swap b,c order if SwapOperands==true */
3357   //   mov a, <default>
3358   //   j<C1> label        /* only if C1 != Br_None */
3359   //   j<C2> label        /* only if C2 != Br_None */
3360   //   FakeUse(a)         /* only if C1 != Br_None */
3361   //   mov a, !<default>  /* only if C1 != Br_None */
3362   //   label:             /* only if C1 != Br_None */
3363   //
3364   // setcc lowering when C1 != Br_None && C2 == Br_None:
3365   //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
3366   //   setcc a, C1
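       // Illustrative example (the actual condition codes come from
       // TableFcmp): "a = fcmp ogt b, c" can lower to:
       //   ucomiss b, c
       //   seta    a
       // seta reads CF==0 && ZF==0, which is false for NaN operands since
       // ucomiss sets ZF, PF, and CF on an unordered result.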
3367   InstFcmp::FCond Condition = Fcmp->getCondition();
3368   assert(Condition < Traits::TableFcmpSize);
3369   if (Traits::TableFcmp[Condition].SwapScalarOperands)
3370     std::swap(Src0, Src1);
3371   const bool HasC1 = (Traits::TableFcmp[Condition].C1 != Traits::Cond::Br_None);
3372   const bool HasC2 = (Traits::TableFcmp[Condition].C2 != Traits::Cond::Br_None);
3373   if (HasC1) {
3374     Src0 = legalize(Src0);
3375     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3376     Variable *T = nullptr;
3377     _mov(T, Src0);
3378     _ucomiss(T, Src1RM);
3379     if (!HasC2) {
3380       assert(Traits::TableFcmp[Condition].Default);
3381       setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer);
3382       return;
3383     }
3384   }
3385   int32_t IntDefault = Traits::TableFcmp[Condition].Default;
3386   if (Consumer == nullptr) {
3387     Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
3388     _mov(Dest, Default);
3389     if (HasC1) {
3390       InstX86Label *Label = InstX86Label::create(Func, this);
3391       _br(Traits::TableFcmp[Condition].C1, Label);
3392       if (HasC2) {
3393         _br(Traits::TableFcmp[Condition].C2, Label);
3394       }
3395       Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
3396       _redefined(_mov(Dest, NonDefault));
3397       Context.insert(Label);
3398     }
3399     return;
3400   }
3401   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3402     CfgNode *TrueSucc = Br->getTargetTrue();
3403     CfgNode *FalseSucc = Br->getTargetFalse();
3404     if (IntDefault != 0)
3405       std::swap(TrueSucc, FalseSucc);
3406     if (HasC1) {
3407       _br(Traits::TableFcmp[Condition].C1, FalseSucc);
3408       if (HasC2) {
3409         _br(Traits::TableFcmp[Condition].C2, FalseSucc);
3410       }
3411       _br(TrueSucc);
3412       return;
3413     }
3414     _br(FalseSucc);
3415     return;
3416   }
3417   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3418     Operand *SrcT = Select->getTrueOperand();
3419     Operand *SrcF = Select->getFalseOperand();
3420     Variable *SelectDest = Select->getDest();
3421     if (IntDefault != 0)
3422       std::swap(SrcT, SrcF);
3423     lowerMove(SelectDest, SrcF, false);
3424     if (HasC1) {
3425       InstX86Label *Label = InstX86Label::create(Func, this);
3426       _br(Traits::TableFcmp[Condition].C1, Label);
3427       if (HasC2) {
3428         _br(Traits::TableFcmp[Condition].C2, Label);
3429       }
3430       static constexpr bool IsRedefinition = true;
3431       lowerMove(SelectDest, SrcT, IsRedefinition);
3432       Context.insert(Label);
3433     }
3434     return;
3435   }
3436   llvm::report_fatal_error("Unexpected consumer type");
3437 }
3438 
3439 template <typename TraitsType>
3440 void TargetX86Base<TraitsType>::lowerFcmpVector(const InstFcmp *Fcmp) {
3441   Operand *Src0 = Fcmp->getSrc(0);
3442   Operand *Src1 = Fcmp->getSrc(1);
3443   Variable *Dest = Fcmp->getDest();
3444 
3445   if (!isVectorType(Dest->getType()))
3446     llvm::report_fatal_error("Expected vector compare");
3447 
3448   InstFcmp::FCond Condition = Fcmp->getCondition();
3449   assert(Condition < Traits::TableFcmpSize);
3450 
3451   if (Traits::TableFcmp[Condition].SwapVectorOperands)
3452     std::swap(Src0, Src1);
3453 
3454   Variable *T = nullptr;
3455 
3456   if (Condition == InstFcmp::True) {
3457     // makeVectorOfOnes() requires an integer vector type.
3458     T = makeVectorOfMinusOnes(IceType_v4i32);
3459   } else if (Condition == InstFcmp::False) {
3460     T = makeVectorOfZeros(Dest->getType());
3461   } else {
3462     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3463     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3464     if (llvm::isa<X86OperandMem>(Src1RM))
3465       Src1RM = legalizeToReg(Src1RM);
3466 
3467     switch (Condition) {
3468     default: {
3469       const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate;
3470       assert(Predicate != Traits::Cond::Cmpps_Invalid);
3471       T = makeReg(Src0RM->getType());
3472       _movp(T, Src0RM);
3473       _cmpps(T, Src1RM, Predicate);
3474     } break;
3475     case InstFcmp::One: {
3476       // Check both unequal and ordered.
3477       T = makeReg(Src0RM->getType());
3478       Variable *T2 = makeReg(Src0RM->getType());
3479       _movp(T, Src0RM);
3480       _cmpps(T, Src1RM, Traits::Cond::Cmpps_neq);
3481       _movp(T2, Src0RM);
3482       _cmpps(T2, Src1RM, Traits::Cond::Cmpps_ord);
3483       _pand(T, T2);
3484     } break;
3485     case InstFcmp::Ueq: {
3486       // Check both equal or unordered.
3487       T = makeReg(Src0RM->getType());
3488       Variable *T2 = makeReg(Src0RM->getType());
3489       _movp(T, Src0RM);
3490       _cmpps(T, Src1RM, Traits::Cond::Cmpps_eq);
3491       _movp(T2, Src0RM);
3492       _cmpps(T2, Src1RM, Traits::Cond::Cmpps_unord);
3493       _por(T, T2);
3494     } break;
3495     }
3496   }
3497 
3498   assert(T != nullptr);
3499   _movp(Dest, T);
3500   eliminateNextVectorSextInstruction(Dest);
3501 }
3502 
3503 inline bool isZero(const Operand *Opnd) {
3504   if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
3505     return C64->getValue() == 0;
3506   if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
3507     return C32->getValue() == 0;
3508   return false;
3509 }
3510 
3511 template <typename TraitsType>
3512 void TargetX86Base<TraitsType>::lowerIcmpAndConsumer(const InstIcmp *Icmp,
3513                                                      const Inst *Consumer) {
3514   Operand *Src0 = legalize(Icmp->getSrc(0));
3515   Operand *Src1 = legalize(Icmp->getSrc(1));
3516   Variable *Dest = Icmp->getDest();
3517 
3518   if (isVectorType(Dest->getType())) {
3519     lowerIcmp(Icmp);
3520     if (Consumer != nullptr)
3521       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
3522     return;
3523   }
3524 
3525   if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
3526     lowerIcmp64(Icmp, Consumer);
3527     return;
3528   }
3529 
3530   // cmp b, c
3531   if (isZero(Src1)) {
3532     switch (Icmp->getCondition()) {
3533     default:
3534       break;
3535     case InstIcmp::Uge:
3536       movOrConsumer(true, Dest, Consumer);
3537       return;
3538     case InstIcmp::Ult:
3539       movOrConsumer(false, Dest, Consumer);
3540       return;
3541     }
3542   }
3543   Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
3544   _cmp(Src0RM, Src1);
3545   setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest,
3546                   Consumer);
3547 }
3548 
3549 template <typename TraitsType>
3550 void TargetX86Base<TraitsType>::lowerIcmpVector(const InstIcmp *Icmp) {
3551   Operand *Src0 = legalize(Icmp->getSrc(0));
3552   Operand *Src1 = legalize(Icmp->getSrc(1));
3553   Variable *Dest = Icmp->getDest();
3554 
3555   if (!isVectorType(Dest->getType()))
3556     llvm::report_fatal_error("Expected a vector compare");
3557 
3558   Type Ty = Src0->getType();
3559   // Promote i1 vectors to 128-bit integer vector types.
3560   if (typeElementType(Ty) == IceType_i1) {
3561     Type NewTy = IceType_NUM;
3562     switch (Ty) {
3563     default:
3564       llvm::report_fatal_error("unexpected type");
3565       break;
3566     case IceType_v4i1:
3567       NewTy = IceType_v4i32;
3568       break;
3569     case IceType_v8i1:
3570       NewTy = IceType_v8i16;
3571       break;
3572     case IceType_v16i1:
3573       NewTy = IceType_v16i8;
3574       break;
3575     }
3576     Variable *NewSrc0 = Func->makeVariable(NewTy);
3577     Variable *NewSrc1 = Func->makeVariable(NewTy);
3578     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
3579     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
3580     Src0 = NewSrc0;
3581     Src1 = NewSrc1;
3582     Ty = NewTy;
3583   }
3584 
3585   InstIcmp::ICond Condition = Icmp->getCondition();
3586 
3587   Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3588   Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3589 
3590   // SSE2 only has signed comparison operations. Transform unsigned inputs
3591   // so that signed comparisons can be used, by flipping the high-order
3592   // (sign) bit of each element.
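       // Worked example for 32-bit lanes: xor-ing each lane with 0x80000000
       // maps unsigned 0x00000000 (minimum) to signed 0x80000000 (minimum)
       // and unsigned 0xFFFFFFFF (maximum) to signed 0x7FFFFFFF (maximum),
       // after which pcmpgtd computes the unsigned ordering.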
3593   if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
3594       Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
3595     Variable *T0 = makeReg(Ty);
3596     Variable *T1 = makeReg(Ty);
3597     Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
3598     _movp(T0, Src0RM);
3599     _pxor(T0, HighOrderBits);
3600     _movp(T1, Src1RM);
3601     _pxor(T1, HighOrderBits);
3602     Src0RM = T0;
3603     Src1RM = T1;
3604   }
3605 
3606   Variable *T = makeReg(Ty);
3607   switch (Condition) {
3608   default:
3609     llvm_unreachable("unexpected condition");
3610     break;
3611   case InstIcmp::Eq: {
3612     if (llvm::isa<X86OperandMem>(Src1RM))
3613       Src1RM = legalizeToReg(Src1RM);
3614     _movp(T, Src0RM);
3615     _pcmpeq(T, Src1RM);
3616   } break;
3617   case InstIcmp::Ne: {
3618     if (llvm::isa<X86OperandMem>(Src1RM))
3619       Src1RM = legalizeToReg(Src1RM);
3620     _movp(T, Src0RM);
3621     _pcmpeq(T, Src1RM);
3622     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3623     _pxor(T, MinusOne);
3624   } break;
3625   case InstIcmp::Ugt:
3626   case InstIcmp::Sgt: {
3627     if (llvm::isa<X86OperandMem>(Src1RM))
3628       Src1RM = legalizeToReg(Src1RM);
3629     _movp(T, Src0RM);
3630     _pcmpgt(T, Src1RM);
3631   } break;
3632   case InstIcmp::Uge:
3633   case InstIcmp::Sge: {
3634     // !(Src1RM > Src0RM)
3635     if (llvm::isa<X86OperandMem>(Src0RM))
3636       Src0RM = legalizeToReg(Src0RM);
3637     _movp(T, Src1RM);
3638     _pcmpgt(T, Src0RM);
3639     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3640     _pxor(T, MinusOne);
3641   } break;
3642   case InstIcmp::Ult:
3643   case InstIcmp::Slt: {
3644     if (llvm::isa<X86OperandMem>(Src0RM))
3645       Src0RM = legalizeToReg(Src0RM);
3646     _movp(T, Src1RM);
3647     _pcmpgt(T, Src0RM);
3648   } break;
3649   case InstIcmp::Ule:
3650   case InstIcmp::Sle: {
3651     // !(Src0RM > Src1RM)
3652     if (llvm::isa<X86OperandMem>(Src1RM))
3653       Src1RM = legalizeToReg(Src1RM);
3654     _movp(T, Src0RM);
3655     _pcmpgt(T, Src1RM);
3656     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3657     _pxor(T, MinusOne);
3658   } break;
3659   }
3660 
3661   _movp(Dest, T);
3662   eliminateNextVectorSextInstruction(Dest);
3663 }
3664 
3665 template <typename TraitsType>
3666 template <typename T>
3667 typename std::enable_if<!T::Is64Bit, void>::type
3668 TargetX86Base<TraitsType>::lowerIcmp64(const InstIcmp *Icmp,
3669                                        const Inst *Consumer) {
3670   // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
3671   Operand *Src0 = legalize(Icmp->getSrc(0));
3672   Operand *Src1 = legalize(Icmp->getSrc(1));
3673   Variable *Dest = Icmp->getDest();
3674   InstIcmp::ICond Condition = Icmp->getCondition();
3675   assert(Condition < Traits::TableIcmp64Size);
3676   Operand *Src0LoRM = nullptr;
3677   Operand *Src0HiRM = nullptr;
3678   // Legalize the portions of Src0 that are going to be needed.
3679   if (isZero(Src1)) {
3680     switch (Condition) {
3681     default:
3682       llvm_unreachable("unexpected condition");
3683       break;
3684     // These two are not optimized, so we fall through to the general case,
3685     // which needs the upper and lower halves legalized.
3686     case InstIcmp::Sgt:
3687     case InstIcmp::Sle:
3688     // These four compare after performing an "or" of the high and low half, so
3689     // they need the upper and lower halves legalized.
3690     case InstIcmp::Eq:
3691     case InstIcmp::Ule:
3692     case InstIcmp::Ne:
3693     case InstIcmp::Ugt:
3694       Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
3695     // These two test only the high half's sign bit, so they need only
3696     // the upper half legalized.
3697     case InstIcmp::Sge:
3698     case InstIcmp::Slt:
3699       Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3700       break;
3701 
3702     // These two move constants and hence need no legalization.
3703     case InstIcmp::Uge:
3704     case InstIcmp::Ult:
3705       break;
3706     }
3707   } else {
3708     Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
3709     Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3710   }
3711   // Optimize comparisons with zero.
3712   if (isZero(Src1)) {
3713     Constant *SignMask = Ctx->getConstantInt32(0x80000000);
3714     Variable *Temp = nullptr;
3715     switch (Condition) {
3716     default:
3717       llvm_unreachable("unexpected condition");
3718       break;
3719     case InstIcmp::Eq:
3720     case InstIcmp::Ule:
3721       // Mov Src0HiRM first, because it was legalized most recently; this
3722       // will sometimes avoid an extra move before the OR.
3723       _mov(Temp, Src0HiRM);
3724       _or(Temp, Src0LoRM);
3725       Context.insert<InstFakeUse>(Temp);
3726       setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
3727       return;
3728     case InstIcmp::Ne:
3729     case InstIcmp::Ugt:
3730       // Mov Src0HiRM first, because it was legalized most recently; this
3731       // will sometimes avoid an extra move before the OR.
3732       _mov(Temp, Src0HiRM);
3733       _or(Temp, Src0LoRM);
3734       Context.insert<InstFakeUse>(Temp);
3735       setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
3736       return;
3737     case InstIcmp::Uge:
3738       movOrConsumer(true, Dest, Consumer);
3739       return;
3740     case InstIcmp::Ult:
3741       movOrConsumer(false, Dest, Consumer);
3742       return;
3743     case InstIcmp::Sgt:
3744       break;
3745     case InstIcmp::Sge:
3746       _test(Src0HiRM, SignMask);
3747       setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
3748       return;
3749     case InstIcmp::Slt:
3750       _test(Src0HiRM, SignMask);
3751       setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
3752       return;
3753     case InstIcmp::Sle:
3754       break;
3755     }
3756   }
3757   // Handle general compares.
3758   Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
3759   Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
3760   if (Consumer == nullptr) {
3761     Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
3762     Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
3763     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3764     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
3765     _mov(Dest, One);
3766     _cmp(Src0HiRM, Src1HiRI);
3767     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
3768       _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
3769     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
3770       _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
3771     _cmp(Src0LoRM, Src1LoRI);
3772     _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
3773     Context.insert(LabelFalse);
3774     _redefined(_mov(Dest, Zero));
3775     Context.insert(LabelTrue);
3776     return;
3777   }
3778   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3779     _cmp(Src0HiRM, Src1HiRI);
3780     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
3781       _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue());
3782     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
3783       _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse());
3784     _cmp(Src0LoRM, Src1LoRI);
3785     _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(),
3786         Br->getTargetFalse());
3787     return;
3788   }
3789   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3790     Operand *SrcT = Select->getTrueOperand();
3791     Operand *SrcF = Select->getFalseOperand();
3792     Variable *SelectDest = Select->getDest();
3793     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3794     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
3795     lowerMove(SelectDest, SrcT, false);
3796     _cmp(Src0HiRM, Src1HiRI);
3797     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
3798       _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
3799     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
3800       _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
3801     _cmp(Src0LoRM, Src1LoRI);
3802     _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
3803     Context.insert(LabelFalse);
3804     static constexpr bool IsRedefinition = true;
3805     lowerMove(SelectDest, SrcF, IsRedefinition);
3806     Context.insert(LabelTrue);
3807     return;
3808   }
3809   llvm::report_fatal_error("Unexpected consumer type");
3810 }
3811 
3812 template <typename TraitsType>
3813 void TargetX86Base<TraitsType>::setccOrConsumer(BrCond Condition,
3814                                                 Variable *Dest,
3815                                                 const Inst *Consumer) {
3816   if (Consumer == nullptr) {
3817     _setcc(Dest, Condition);
3818     return;
3819   }
3820   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3821     _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
3822     return;
3823   }
3824   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3825     Operand *SrcT = Select->getTrueOperand();
3826     Operand *SrcF = Select->getFalseOperand();
3827     Variable *SelectDest = Select->getDest();
3828     lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
3829     return;
3830   }
3831   llvm::report_fatal_error("Unexpected consumer type");
3832 }
3833 
3834 template <typename TraitsType>
3835 void TargetX86Base<TraitsType>::movOrConsumer(bool IcmpResult, Variable *Dest,
3836                                               const Inst *Consumer) {
3837   if (Consumer == nullptr) {
3838     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3839     return;
3840   }
3841   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3842     // TODO(sehr,stichnot): This could be done with a single unconditional
3843     // branch, but subzero doesn't yet know how to handle the resulting
3844     // control-flow-graph changes. Teach it to do so to eliminate the mov/cmp.
3845     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3846     _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
3847     _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3848     return;
3849   }
3850   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3851     Operand *Src = nullptr;
3852     if (IcmpResult) {
3853       Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
3854     } else {
3855       Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
3856     }
3857     Variable *SelectDest = Select->getDest();
3858     lowerMove(SelectDest, Src, false);
3859     return;
3860   }
3861   llvm::report_fatal_error("Unexpected consumer type");
3862 }
3863 
3864 template <typename TraitsType>
3865 void TargetX86Base<TraitsType>::lowerArithAndConsumer(
3866     const InstArithmetic *Arith, const Inst *Consumer) {
3867   Variable *T = nullptr;
3868   Operand *Src0 = legalize(Arith->getSrc(0));
3869   Operand *Src1 = legalize(Arith->getSrc(1));
3870   Variable *Dest = Arith->getDest();
3871   switch (Arith->getOp()) {
3872   default:
3873     llvm_unreachable("arithmetic operator not AND or OR");
3874     break;
3875   case InstArithmetic::And:
3876     _mov(T, Src0);
3877     // Test cannot have an address in the second position.  Since T is
3878     // guaranteed to be a register and Src1 could be a memory load, ensure
3879     // that the second argument is a register.
3880     if (llvm::isa<Constant>(Src1))
3881       _test(T, Src1);
3882     else
3883       _test(Src1, T);
3884     break;
3885   case InstArithmetic::Or:
3886     _mov(T, Src0);
3887     _or(T, Src1);
3888     break;
3889   }
3890 
3891   if (Consumer == nullptr) {
3892     llvm::report_fatal_error("Expected a consumer instruction");
3893   }
3894   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3895     Context.insert<InstFakeUse>(T);
3896     Context.insert<InstFakeDef>(Dest);
3897     _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3898     return;
3899   }
3900   llvm::report_fatal_error("Unexpected consumer type");
3901 }
3902 
3903 template <typename TraitsType>
3904 void TargetX86Base<TraitsType>::lowerInsertElement(
3905     const InstInsertElement *Instr) {
3906   Operand *SourceVectNotLegalized = Instr->getSrc(0);
3907   Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
3908   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
3909   // Only constant indices are allowed in PNaCl IR.
3910   assert(ElementIndex);
3911   unsigned Index = ElementIndex->getValue();
3912   assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
3913 
3914   Type Ty = SourceVectNotLegalized->getType();
3915   Type ElementTy = typeElementType(Ty);
3916   Type InVectorElementTy = Traits::getInVectorElementType(Ty);
3917 
3918   if (ElementTy == IceType_i1) {
3919     // Expand the element to the appropriate size for it to be inserted in the
3920     // vector.
3921     Variable *Expanded = Func->makeVariable(InVectorElementTy);
3922     auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
3923                                   ElementToInsertNotLegalized);
3924     lowerCast(Cast);
3925     ElementToInsertNotLegalized = Expanded;
3926   }
3927 
3928   if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
3929       InstructionSet >= Traits::SSE4_1) {
3930     // Use insertps, pinsrb, pinsrw, or pinsrd.
3931     Operand *ElementRM =
3932         legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3933     Operand *SourceVectRM =
3934         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3935     Variable *T = makeReg(Ty);
3936     _movp(T, SourceVectRM);
3937     if (Ty == IceType_v4f32) {
3938       _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
3939     } else {
3940       // For the pinsrb and pinsrw instructions, when the source operand is a
3941       // register, it must be a full r32 register like eax, and not ax/al/ah.
3942       // For filetype=asm, InstX86Pinsr<TraitsType>::emit() compensates for
3943       // the use of r16 and r8 by converting them through getBaseReg(),
3944       // while emitIAS() validates that the original and base register
3945       // encodings are the same.
3946       if (ElementRM->getType() == IceType_i8 &&
3947           llvm::isa<Variable>(ElementRM)) {
3948         // Don't use ah/bh/ch/dh for pinsrb.
3949         ElementRM = copyToReg8(ElementRM);
3950       }
3951       _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
3952     }
3953     _movp(Instr->getDest(), T);
3954   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3955     // Use shufps or movss.
3956     Variable *ElementR = nullptr;
3957     Operand *SourceVectRM =
3958         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3959 
3960     if (InVectorElementTy == IceType_f32) {
3961       // ElementR will be in an XMM register since it is floating point.
3962       ElementR = legalizeToReg(ElementToInsertNotLegalized);
3963     } else {
3964       // Copy an integer to an XMM register.
3965       Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3966       ElementR = makeReg(Ty);
3967       _movd(ElementR, T);
3968     }
3969 
3970     if (Index == 0) {
3971       Variable *T = makeReg(Ty);
3972       _movp(T, SourceVectRM);
3973       _movss(T, ElementR);
3974       _movp(Instr->getDest(), T);
3975       return;
3976     }
3977 
3978     // shufps treats the source and destination operands as vectors of four
3979     // doublewords. The destination's two high doublewords are selected from
3980     // the source operand and the two low doublewords are selected from the
3981     // (original value of) the destination operand. An insertelement operation
3982     // can be effected with a sequence of two shufps operations with
3983     // appropriate masks. In all cases below, Element[0] is being inserted into
3984     // SourceVectOperand. Indices are ordered from left to right.
3985     //
3986     // insertelement into index 1 (result is stored in ElementR):
3987     //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
3988     //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
3989     //
3990     // insertelement into index 2 (result is stored in T):
3991     //   T := SourceVectRM
3992     //   ElementR := ElementR[0, 0] T[0, 3]
3993     //   T := T[0, 1] ElementR[0, 3]
3994     //
3995     // insertelement into index 3 (result is stored in T):
3996     //   T := SourceVectRM
3997     //   ElementR := ElementR[0, 0] T[0, 2]
3998     //   T := T[0, 1] ElementR[3, 0]
3999     const unsigned char Mask1[3] = {0, 192, 128};
4000     const unsigned char Mask2[3] = {227, 196, 52};
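         // Decoding the imm8: result lane i selects via bits [2i+1:2i], with
         // lanes 0-1 taken from the destination and lanes 2-3 from the source.
         // E.g. Mask2[0] == 227 == 0b11'10'00'11 selects {dst[3], dst[0],
         // src[2], src[3]}, matching "ElementR[3, 0] SourceVectRM[2, 3]" above.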
4001 
4002     Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
4003     Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
4004 
4005     if (Index == 1) {
4006       _shufps(ElementR, SourceVectRM, Mask1Constant);
4007       _shufps(ElementR, SourceVectRM, Mask2Constant);
4008       _movp(Instr->getDest(), ElementR);
4009     } else {
4010       Variable *T = makeReg(Ty);
4011       _movp(T, SourceVectRM);
4012       _shufps(ElementR, T, Mask1Constant);
4013       _shufps(T, ElementR, Mask2Constant);
4014       _movp(Instr->getDest(), T);
4015     }
4016   } else {
4017     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
4018     // Spill the value to a stack slot and perform the insertion in memory.
4019     //
4020     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
4021     // for legalizing to mem is implemented.
4022     Variable *Slot = Func->makeVariable(Ty);
4023     Slot->setMustNotHaveReg();
4024     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
4025 
4026     // Compute the location of the position to insert in memory.
4027     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
4028     X86OperandMem *Loc =
4029         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
4030     _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
4031 
4032     Variable *T = makeReg(Ty);
4033     _movp(T, Slot);
4034     _movp(Instr->getDest(), T);
4035   }
4036 }
4037 
4038 template <typename TraitsType>
4039 void TargetX86Base<TraitsType>::lowerIntrinsicCall(
4040     const InstIntrinsicCall *Instr) {
4041   switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
4042   case Intrinsics::AtomicCmpxchg: {
4043     if (!Intrinsics::isMemoryOrderValid(
4044             ID, getConstantMemoryOrder(Instr->getArg(3)),
4045             getConstantMemoryOrder(Instr->getArg(4)))) {
4046       Func->setError("Unexpected memory ordering for AtomicCmpxchg");
4047       return;
4048     }
4049     Variable *DestPrev = Instr->getDest();
4050     Operand *PtrToMem = legalize(Instr->getArg(0));
4051     Operand *Expected = legalize(Instr->getArg(1));
4052     Operand *Desired = legalize(Instr->getArg(2));
4053     if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
4054       return;
4055     lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
4056     return;
4057   }
4058   case Intrinsics::AtomicFence:
4059     if (!Intrinsics::isMemoryOrderValid(
4060             ID, getConstantMemoryOrder(Instr->getArg(0)))) {
4061       Func->setError("Unexpected memory ordering for AtomicFence");
4062       return;
4063     }
4064     _mfence();
4065     return;
4066   case Intrinsics::AtomicFenceAll:
4067     // NOTE: FenceAll should prevent any load/store from being moved across
4068     // the fence (both atomic and non-atomic). The InstX8632Mfence
4069     // instruction is currently marked coarsely as "HasSideEffects".
4070     _mfence();
4071     return;
4072   case Intrinsics::AtomicIsLockFree: {
4073     // X86 is always lock-free for 8/16/32/64-bit accesses.
4074     // TODO(jvoung): Since the result is constant when given a constant byte
4075     // size, this opens up DCE opportunities.
4076     Operand *ByteSize = Instr->getArg(0);
4077     Variable *Dest = Instr->getDest();
4078     if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
4079       Constant *Result;
4080       switch (CI->getValue()) {
4081       default:
4082         // Some x86-64 processors support the cmpxchg16b instruction, which can
4083         // make 16-byte operations lock free (when used with the LOCK prefix).
4084         // However, that's not supported in 32-bit mode, so just return 0 even
4085         // for large sizes.
4086         Result = Ctx->getConstantZero(IceType_i32);
4087         break;
4088       case 1:
4089       case 2:
4090       case 4:
4091       case 8:
4092         Result = Ctx->getConstantInt32(1);
4093         break;
4094       }
4095       _mov(Dest, Result);
4096       return;
4097     }
4098     // The PNaCl ABI requires the byte size to be a compile-time constant.
4099     Func->setError("AtomicIsLockFree byte size should be compile-time const");
4100     return;
4101   }
4102   case Intrinsics::AtomicLoad: {
4103     // We require the memory address to be naturally aligned; given that,
4104     // normal loads are atomic.
4105     if (!Intrinsics::isMemoryOrderValid(
4106             ID, getConstantMemoryOrder(Instr->getArg(1)))) {
4107       Func->setError("Unexpected memory ordering for AtomicLoad");
4108       return;
4109     }
4110     Variable *Dest = Instr->getDest();
4111     if (!Traits::Is64Bit) {
4112       if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
4113         // Follow what GCC does and use a movq instead of what lowerLoad()
4114         // normally does (split the load into two). Thus, this skips
4115         // load/arithmetic op folding. Load/arithmetic folding can't happen
4116         // anyway, since this is x86-32 and integer arithmetic only happens on
4117         // 32-bit quantities.
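             // The single 8-byte movq is what provides the atomicity here; as
             // with GCC, this relies on naturally aligned 64-bit SSE loads
             // being performed as one indivisible memory access.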
4118         Variable *T = makeReg(IceType_f64);
4119         X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
4120         _movq(T, Addr);
4121         // Then cast the bits back out of the XMM register to the i64 Dest.
4122         auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
4123         lowerCast(Cast);
4124         // Make sure that the atomic load isn't elided when unused.
4125         Context.insert<InstFakeUse>(Dest64On32->getLo());
4126         Context.insert<InstFakeUse>(Dest64On32->getHi());
4127         return;
4128       }
4129     }
4130     auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
4131     lowerLoad(Load);
4132     // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
4133     // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
4134     // the FakeUse on the last-inserted instruction's dest.
4135     Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
4136     return;
4137   }
4138   case Intrinsics::AtomicRMW:
4139     if (!Intrinsics::isMemoryOrderValid(
4140             ID, getConstantMemoryOrder(Instr->getArg(3)))) {
4141       Func->setError("Unexpected memory ordering for AtomicRMW");
4142       return;
4143     }
4144     lowerAtomicRMW(
4145         Instr->getDest(),
4146         static_cast<uint32_t>(
4147             llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
4148         Instr->getArg(1), Instr->getArg(2));
4149     return;
4150   case Intrinsics::AtomicStore: {
4151     if (!Intrinsics::isMemoryOrderValid(
4152             ID, getConstantMemoryOrder(Instr->getArg(2)))) {
4153       Func->setError("Unexpected memory ordering for AtomicStore");
4154       return;
4155     }
4156     // We require the memory address to be naturally aligned; given that,
4157     // normal stores are atomic. Add a fence after the store to make it
4158     // visible.
4159     Operand *Value = Instr->getArg(0);
4160     Operand *Ptr = Instr->getArg(1);
4161     if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
4162       // Use a movq instead of what lowerStore() normally does (split the
4163       // store into two), following what GCC does. Cast the bits from the
4164       // integer pair into an xmm register first.
4165       Variable *T = makeReg(IceType_f64);
4166       auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
4167       lowerCast(Cast);
4168       // Then store XMM w/ a movq.
4169       X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
4170       _storeq(T, Addr);
4171       _mfence();
4172       return;
4173     }
4174     auto *Store = InstStore::create(Func, Value, Ptr);
4175     lowerStore(Store);
4176     _mfence();
4177     return;
4178   }
4179   case Intrinsics::Bswap: {
4180     Variable *Dest = Instr->getDest();
4181     Operand *Val = Instr->getArg(0);
4182     // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
4183     // must be a register. Use rotate left for 16-bit bswap.
    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
      Val = legalizeUndef(Val);
      Variable *T_Lo = legalizeToReg(loOperand(Val));
      Variable *T_Hi = legalizeToReg(hiOperand(Val));
      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      _bswap(T_Lo);
      _bswap(T_Hi);
      _mov(DestLo, T_Hi);
      _mov(DestHi, T_Lo);
    } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
               Val->getType() == IceType_i32) {
      Variable *T = legalizeToReg(Val);
      _bswap(T);
      _mov(Dest, T);
    } else {
      assert(Val->getType() == IceType_i16);
      Constant *Eight = Ctx->getConstantInt16(8);
      Variable *T = nullptr;
      Val = legalize(Val);
      _mov(T, Val);
      _rol(T, Eight);
      _mov(Dest, T);
    }
    return;
  }
  case Intrinsics::Ctpop: {
    Variable *Dest = Instr->getDest();
    Variable *T = nullptr;
    Operand *Val = Instr->getArg(0);
    Type ValTy = Val->getType();
    assert(ValTy == IceType_i32 || ValTy == IceType_i64);

    if (!Traits::Is64Bit) {
      T = Dest;
    } else {
      T = makeReg(IceType_i64);
      if (ValTy == IceType_i32) {
        // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
        // converting it to a 64-bit value and using ctpop_i64. _movzx should
        // ensure we will not have any bits set in Val's upper 32 bits.
        Variable *V = makeReg(IceType_i64);
        Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
        _movzx(V, ValRM);
        Val = V;
      }
      ValTy = IceType_i64;
    }

    InstCall *Call =
        makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
                                            : RuntimeHelper::H_call_ctpop_i64,
                       T, 1);
    Call->addArg(Val);
    lowerCall(Call);
    // The popcount helpers always return 32-bit values, while the intrinsic's
    // signature matches the native POPCNT instruction and fills a 64-bit reg
    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
    // the user doesn't do that in the IR; if they do, this zeroing
    // instruction is dead and gets optimized out.
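    // (Note that the popcount of an i64 is at most 64, so the meaningful
    // result always fits in the low 32 bits; only the upper half ever needs
    // clearing.)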
    if (!Traits::Is64Bit) {
      assert(T == Dest);
      if (Val->getType() == IceType_i64) {
        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
        Constant *Zero = Ctx->getConstantZero(IceType_i32);
        _mov(DestHi, Zero);
      }
    } else {
      assert(Val->getType() == IceType_i64);
      // T is 64-bit. It needs to be copied to Dest. We need to:
      //
      // T_1.32 = trunc T.64 to i32
      // T_2.64 = zext T_1.32 to i64
      // Dest.<<right_size>> = T_2.<<right_size>>
      //
      // which ensures the upper 32 bits will always be cleared. Just doing a
      //
      // mov Dest.32 = trunc T.32 to i32
      //
      // is dangerous because there's a chance the compiler will optimize this
      // copy out. To use _movzx we need two new registers (one 32-bit and
      // another 64-bit wide).
      Variable *T_1 = makeReg(IceType_i32);
      _mov(T_1, T);
      Variable *T_2 = makeReg(IceType_i64);
      _movzx(T_2, T_1);
      _mov(Dest, T_2);
    }
    return;
  }
  case Intrinsics::Ctlz: {
    // The "is zero undef" parameter is ignored and we always return a
    // well-defined value.
    Operand *Val = legalize(Instr->getArg(0));
    Operand *FirstVal;
    Operand *SecondVal = nullptr;
    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
      FirstVal = loOperand(Val);
      SecondVal = hiOperand(Val);
    } else {
      FirstVal = Val;
    }
    constexpr bool IsCttz = false;
    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
                    SecondVal);
    return;
  }
  case Intrinsics::Cttz: {
    // The "is zero undef" parameter is ignored and we always return a
    // well-defined value.
    Operand *Val = legalize(Instr->getArg(0));
    Operand *FirstVal;
    Operand *SecondVal = nullptr;
    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
      FirstVal = hiOperand(Val);
      SecondVal = loOperand(Val);
    } else {
      FirstVal = Val;
    }
    constexpr bool IsCttz = true;
    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
                    SecondVal);
    return;
  }
  case Intrinsics::Fabs: {
    Operand *Src = legalize(Instr->getArg(0));
    Type Ty = Src->getType();
    Variable *Dest = Instr->getDest();
    Variable *T = makeVectorOfFabsMask(Ty);
    // The pand instruction operates on an m128 memory operand, so if Src is an
    // f32 or f64, we need to make sure it's in a register.
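    // (The mask presumably has only the sign bit clear in each lane, e.g.
    // 0x7FFFFFFF per f32 lane, so the pand below clears the sign bit and
    // yields the absolute value.)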
    if (isVectorType(Ty)) {
      if (llvm::isa<X86OperandMem>(Src))
        Src = legalizeToReg(Src);
    } else {
      Src = legalizeToReg(Src);
    }
    _pand(T, Src);
    if (isVectorType(Ty))
      _movp(Dest, T);
    else
      _mov(Dest, T);
    return;
  }
  case Intrinsics::Longjmp: {
    InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
    Call->addArg(Instr->getArg(0));
    Call->addArg(Instr->getArg(1));
    lowerCall(Call);
    return;
  }
  case Intrinsics::Memcpy: {
    lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
    return;
  }
  case Intrinsics::Memmove: {
    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
    return;
  }
  case Intrinsics::Memset: {
    lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
    return;
  }
  case Intrinsics::NaClReadTP: {
    if (NeedSandboxing) {
      Operand *Src =
          dispatchToConcrete(&ConcreteTarget::createNaClReadTPSrcOperand);
      Variable *Dest = Instr->getDest();
      Variable *T = nullptr;
      _mov(T, Src);
      _mov(Dest, T);
    } else {
      InstCall *Call =
          makeHelperCall(RuntimeHelper::H_call_read_tp, Instr->getDest(), 0);
      lowerCall(Call);
    }
    return;
  }
  case Intrinsics::Setjmp: {
    InstCall *Call =
        makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
    Call->addArg(Instr->getArg(0));
    lowerCall(Call);
    return;
  }
  case Intrinsics::Sqrt: {
    assert(isScalarFloatingType(Instr->getDest()->getType()) ||
           getFlags().getApplicationBinaryInterface() != ::Ice::ABI_PNaCl);
    Operand *Src = legalize(Instr->getArg(0));
    Variable *Dest = Instr->getDest();
    Variable *T = makeReg(Dest->getType());
    _sqrt(T, Src);
    _mov(Dest, T);
    return;
  }
  case Intrinsics::Stacksave: {
    if (!Traits::Is64Bit || !NeedSandboxing) {
      Variable *esp = Func->getTarget()->getPhysicalRegister(getStackReg(),
                                                             Traits::WordType);
      Variable *Dest = Instr->getDest();
      _mov(Dest, esp);
      return;
    }
    Variable *esp = Func->getTarget()->getPhysicalRegister(
        Traits::RegisterSet::Reg_esp, IceType_i32);
    Variable *Dest = Instr->getDest();
    _mov(Dest, esp);

    return;
  }
  case Intrinsics::Stackrestore: {
    Operand *Src = Instr->getArg(0);
    _mov_sp(Src);
    return;
  }

  case Intrinsics::Trap:
    _ud2();
    return;
  case Intrinsics::LoadSubVector: {
    assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
           "LoadSubVector second argument must be a constant");
    Variable *Dest = Instr->getDest();
    Type Ty = Dest->getType();
    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
    Operand *Addr = Instr->getArg(0);
    X86OperandMem *Src = formMemoryOperand(Addr, Ty);
    doMockBoundsCheck(Src);

    if (Dest->isRematerializable()) {
      Context.insert<InstFakeDef>(Dest);
      return;
    }

    auto *T = makeReg(Ty);
    switch (SubVectorSize->getValue()) {
    case 4:
      _movd(T, Src);
      break;
    case 8:
      _movq(T, Src);
      break;
    default:
      Func->setError("Unexpected size for LoadSubVector");
      return;
    }
    _movp(Dest, T);
    return;
  }
  case Intrinsics::StoreSubVector: {
    assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
           "StoreSubVector third argument must be a constant");
    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
    Operand *Value = Instr->getArg(0);
    Operand *Addr = Instr->getArg(1);
    X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
    doMockBoundsCheck(NewAddr);

    Value = legalizeToReg(Value);

    switch (SubVectorSize->getValue()) {
    case 4:
      _stored(Value, NewAddr);
      break;
    case 8:
      _storeq(Value, NewAddr);
      break;
    default:
      Func->setError("Unexpected size for StoreSubVector");
      return;
    }
    return;
  }
  case Intrinsics::VectorPackSigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
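    // (packss narrows each source element with signed saturation: values
    // outside the narrower type's range clamp to its min/max, per the SSE2
    // packsswb/packssdw semantics.)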
    auto *T = makeReg(Src0->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _packss(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::VectorPackUnsigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Src0->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _packus(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::SignMask: {
    Operand *SrcReg = legalizeToReg(Instr->getArg(0));
    Variable *Dest = Instr->getDest();
    Variable *T = makeReg(IceType_i32);
    if (SrcReg->getType() == IceType_v4f32 ||
        SrcReg->getType() == IceType_v4i32 ||
        SrcReg->getType() == IceType_v16i8) {
      _movmsk(T, SrcReg);
    } else {
      // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
      llvm::report_fatal_error("Invalid type for SignMask intrinsic");
    }
    _mov(Dest, T);
    return;
  }
  case Intrinsics::MultiplyHighSigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _pmulhw(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::MultiplyHighUnsigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _pmulhuw(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::MultiplyAddPairs: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _pmaddwd(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::AddSaturateSigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _padds(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::SubtractSaturateSigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _psubs(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::AddSaturateUnsigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _paddus(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::SubtractSaturateUnsigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _psubus(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::Nearbyint: {
    Operand *Src = Instr->getArg(0);
    Variable *Dest = Instr->getDest();
    Type DestTy = Dest->getType();
    if (isVectorType(DestTy)) {
      assert(DestTy == IceType_v4i32);
      assert(Src->getType() == IceType_v4f32);
      Operand *Src0R = legalizeToReg(Src);
      Variable *T = makeReg(DestTy);
      _cvt(T, Src0R, Traits::Insts::Cvt::Ps2dq);
      _movp(Dest, T);
    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
      llvm::report_fatal_error("Helper call was expected");
    } else {
      Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
      Variable *T_1 = nullptr;
      if (Traits::Is64Bit && DestTy == IceType_i64) {
        T_1 = makeReg(IceType_i64);
      } else {
        assert(DestTy != IceType_i64);
        T_1 = makeReg(IceType_i32);
      }
      // cvt() requires its integer argument to be a GPR.
      Variable *T_2 = makeReg(DestTy);
      if (isByteSizedType(DestTy)) {
        assert(T_1->getType() == IceType_i32);
        T_1->setRegClass(RCX86_Is32To8);
        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
      }
      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Ss2si);
      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
      if (DestTy == IceType_i1)
        _and(T_2, Ctx->getConstantInt1(1));
      _mov(Dest, T_2);
    }
    return;
  }
  case Intrinsics::Round: {
    assert(InstructionSet >= Traits::SSE4_1);
    Variable *Dest = Instr->getDest();
    Operand *Src = Instr->getArg(0);
    Operand *Mode = Instr->getArg(1);
    assert(llvm::isa<ConstantInteger32>(Mode) &&
           "Round last argument must be a constant");
    auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
    int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
    (void)Imm;
    assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
    auto *T = makeReg(Dest->getType());
    _round(T, SrcRM, Mode);
    _movp(Dest, T);
    return;
  }
  default: // UnknownIntrinsic
    Func->setError("Unexpected intrinsic");
    return;
  }
  return;
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerAtomicCmpxchg(Variable *DestPrev,
                                                   Operand *Ptr,
                                                   Operand *Expected,
                                                   Operand *Desired) {
  Type Ty = Expected->getType();
  if (!Traits::Is64Bit && Ty == IceType_i64) {
    // Reserve the pre-colored registers first, before adding any more
    // infinite-weight variables from formMemoryOperand's legalization.
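    // (cmpxchg8b compares edx:eax against the 64-bit memory operand and, on a
    // match, stores ecx:ebx into it, which is why exactly these four
    // registers are pre-colored here.)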
    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
    _mov(T_eax, loOperand(Expected));
    _mov(T_edx, hiOperand(Expected));
    _mov(T_ebx, loOperand(Desired));
    _mov(T_ecx, hiOperand(Desired));
    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
    constexpr bool Locked = true;
    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
    auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
    auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
    _mov(DestLo, T_eax);
    _mov(DestHi, T_edx);
    return;
  }
  RegNumT Eax;
  switch (Ty) {
  default:
    llvm::report_fatal_error("Bad type for cmpxchg");
  case IceType_i64:
    Eax = Traits::getRaxOrDie();
    break;
  case IceType_i32:
    Eax = Traits::RegisterSet::Reg_eax;
    break;
  case IceType_i16:
    Eax = Traits::RegisterSet::Reg_ax;
    break;
  case IceType_i8:
    Eax = Traits::RegisterSet::Reg_al;
    break;
  }
  Variable *T_eax = makeReg(Ty, Eax);
  _mov(T_eax, Expected);
  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
  Variable *DesiredReg = legalizeToReg(Desired);
  constexpr bool Locked = true;
  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
  _mov(DestPrev, T_eax);
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::tryOptimizedCmpxchgCmpBr(Variable *Dest,
                                                         Operand *PtrToMem,
                                                         Operand *Expected,
                                                         Operand *Desired) {
  if (Func->getOptLevel() == Opt_m1)
    return false;
  // Peek ahead a few instructions and see how Dest is used.
  // It's very common to have:
  //
  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
  // [%y_phi = ...] // list of phi stores
  // %p = icmp eq i32 %x, %expected
  // br i1 %p, label %l1, label %l2
  //
  // which we can optimize into:
  //
  // %x = <cmpxchg code>
  // [%y_phi = ...] // list of phi stores
  // br eq, %l1, %l2
  InstList::iterator I = Context.getCur();
  // I is currently the InstIntrinsicCall. Peek past that.
  // This assumes that the atomic cmpxchg has not been lowered yet, so that
  // the instructions seen in the scan from "Cur" are simple.
  assert(llvm::isa<InstIntrinsicCall>(*I));
  Inst *NextInst = Context.getNextInst(I);
  if (!NextInst)
    return false;
  // There might be phi assignments right before the compare+branch, since this
  // could be a backward branch for a loop. This placement of assignments is
  // determined by placePhiStores().
  CfgVector<InstAssign *> PhiAssigns;
  while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
    if (PhiAssign->getDest() == Dest)
      return false;
    PhiAssigns.push_back(PhiAssign);
    NextInst = Context.getNextInst(I);
    if (!NextInst)
      return false;
  }
  if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
      return false;
    }
    NextInst = Context.getNextInst(I);
    if (!NextInst)
      return false;
    if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
      if (!NextBr->isUnconditional() &&
          NextCmp->getDest() == NextBr->getCondition() &&
          NextBr->isLastUse(NextCmp->getDest())) {
        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
          // Lower the phi assignments now, before the branch (same placement
          // as before).
          InstAssign *PhiAssign = PhiAssigns[i];
          PhiAssign->setDeleted();
          lowerAssign(PhiAssign);
          Context.advanceNext();
        }
        _br(Traits::Cond::Br_e, NextBr->getTargetTrue(),
            NextBr->getTargetFalse());
        // Skip over the old compare and branch, by deleting them.
        NextCmp->setDeleted();
        NextBr->setDeleted();
        Context.advanceNext();
        Context.advanceNext();
        return true;
      }
    }
  }
  return false;
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerAtomicRMW(Variable *Dest,
                                               uint32_t Operation, Operand *Ptr,
                                               Operand *Val) {
  bool NeedsCmpxchg = false;
  LowerBinOp Op_Lo = nullptr;
  LowerBinOp Op_Hi = nullptr;
  switch (Operation) {
  default:
    Func->setError("Unknown AtomicRMW operation");
    return;
  case Intrinsics::AtomicAdd: {
    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
      // All the fall-through paths must set this to true; it is used only
      // for asserting.
      NeedsCmpxchg = true;
      Op_Lo = &TargetX86Base<TraitsType>::_add;
      Op_Hi = &TargetX86Base<TraitsType>::_adc;
      break;
    }
    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
    constexpr bool Locked = true;
    Variable *T = nullptr;
    _mov(T, Val);
    _xadd(Addr, T, Locked);
    _mov(Dest, T);
    return;
  }
  case Intrinsics::AtomicSub: {
    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
      NeedsCmpxchg = true;
      Op_Lo = &TargetX86Base<TraitsType>::_sub;
      Op_Hi = &TargetX86Base<TraitsType>::_sbb;
      break;
    }
    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
    constexpr bool Locked = true;
    Variable *T = nullptr;
    _mov(T, Val);
    _neg(T);
    _xadd(Addr, T, Locked);
    _mov(Dest, T);
    return;
  }
  case Intrinsics::AtomicOr:
    // TODO(jvoung): If Dest is null or dead, then some of these
    // operations do not need an "exchange", but just a locked op.
    // That appears to be "worth" it for sub, or, and, and xor.
    // xadd is probably fine vs lock add for add, and xchg is fine
    // vs an atomic store.
    NeedsCmpxchg = true;
    Op_Lo = &TargetX86Base<TraitsType>::_or;
    Op_Hi = &TargetX86Base<TraitsType>::_or;
    break;
  case Intrinsics::AtomicAnd:
    NeedsCmpxchg = true;
    Op_Lo = &TargetX86Base<TraitsType>::_and;
    Op_Hi = &TargetX86Base<TraitsType>::_and;
    break;
  case Intrinsics::AtomicXor:
    NeedsCmpxchg = true;
    Op_Lo = &TargetX86Base<TraitsType>::_xor;
    Op_Hi = &TargetX86Base<TraitsType>::_xor;
    break;
  case Intrinsics::AtomicExchange:
    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
      NeedsCmpxchg = true;
      // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
      // just need to be moved to the ecx and ebx registers.
      Op_Lo = nullptr;
      Op_Hi = nullptr;
      break;
    }
    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
    Variable *T = nullptr;
    _mov(T, Val);
    _xchg(Addr, T);
    _mov(Dest, T);
    return;
  }
  // Otherwise, we need a cmpxchg loop.
  (void)NeedsCmpxchg;
  assert(NeedsCmpxchg);
  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
                                                         LowerBinOp Op_Hi,
                                                         Variable *Dest,
                                                         Operand *Ptr,
                                                         Operand *Val) {
  // Expand a more complex RMW operation as a cmpxchg loop:
  // For 64-bit:
  //   mov     eax, [ptr]
  //   mov     edx, [ptr + 4]
  // .LABEL:
  //   mov     ebx, eax
  //   <Op_Lo> ebx, <desired_adj_lo>
  //   mov     ecx, edx
  //   <Op_Hi> ecx, <desired_adj_hi>
  //   lock cmpxchg8b [ptr]
  //   jne     .LABEL
  //   mov     <dest_lo>, eax
  //   mov     <dest_hi>, edx
  //
  // For 32-bit:
  //   mov     eax, [ptr]
  // .LABEL:
  //   mov     <reg>, eax
  //   op      <reg>, [desired_adj]
  //   lock cmpxchg [ptr], <reg>
  //   jne     .LABEL
  //   mov     <dest>, eax
  //
  // If Op_{Lo,Hi} are nullptr, then just copy the value.
  Val = legalize(Val);
  Type Ty = Val->getType();
  if (!Traits::Is64Bit && Ty == IceType_i64) {
    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
    _mov(T_eax, loOperand(Addr));
    _mov(T_edx, hiOperand(Addr));
    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
    InstX86Label *Label = InstX86Label::create(Func, this);
    const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
    if (!IsXchg8b) {
      Context.insert(Label);
      _mov(T_ebx, T_eax);
      (this->*Op_Lo)(T_ebx, loOperand(Val));
      _mov(T_ecx, T_edx);
      (this->*Op_Hi)(T_ecx, hiOperand(Val));
    } else {
      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
      // It just needs the Val loaded into ebx and ecx.
      // That can also be done before the loop.
      _mov(T_ebx, loOperand(Val));
      _mov(T_ecx, hiOperand(Val));
      Context.insert(Label);
    }
    constexpr bool Locked = true;
    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
    _br(Traits::Cond::Br_ne, Label);
    if (!IsXchg8b) {
      // If Val is a variable, model the extended live range of Val through
      // the end of the loop, since it will be re-used by the loop.
      if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
        auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
        auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
        Context.insert<InstFakeUse>(ValLo);
        Context.insert<InstFakeUse>(ValHi);
      }
    } else {
      // For xchg, the loop is slightly smaller and ebx/ecx are used.
      Context.insert<InstFakeUse>(T_ebx);
      Context.insert<InstFakeUse>(T_ecx);
    }
    // The address base (if any) is also reused in the loop.
    if (Variable *Base = Addr->getBase())
      Context.insert<InstFakeUse>(Base);
    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
    _mov(DestLo, T_eax);
    _mov(DestHi, T_edx);
    return;
  }
  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
  RegNumT Eax;
  switch (Ty) {
  default:
    llvm::report_fatal_error("Bad type for atomicRMW");
  case IceType_i64:
    Eax = Traits::getRaxOrDie();
    break;
  case IceType_i32:
    Eax = Traits::RegisterSet::Reg_eax;
    break;
  case IceType_i16:
    Eax = Traits::RegisterSet::Reg_ax;
    break;
  case IceType_i8:
    Eax = Traits::RegisterSet::Reg_al;
    break;
  }
  Variable *T_eax = makeReg(Ty, Eax);
  _mov(T_eax, Addr);
  auto *Label = Context.insert<InstX86Label>(this);
  // We want to pick a different register for T than Eax, so don't use
  // _mov(T == nullptr, T_eax).
  Variable *T = makeReg(Ty);
  _mov(T, T_eax);
  (this->*Op_Lo)(T, Val);
  constexpr bool Locked = true;
  _cmpxchg(Addr, T_eax, T, Locked);
  _br(Traits::Cond::Br_ne, Label);
  // If Val is a variable, model the extended live range of Val through
  // the end of the loop, since it will be re-used by the loop.
  if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
    Context.insert<InstFakeUse>(ValVar);
  }
  // The address base (if any) is also reused in the loop.
  if (Variable *Base = Addr->getBase())
    Context.insert<InstFakeUse>(Base);
  _mov(Dest, T_eax);
}

/// Lowers count {trailing, leading} zeros intrinsic.
///
/// We could do constant folding here, but that should have
/// been done by the front-end/middle-end optimizations.
template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerCountZeros(bool Cttz, Type Ty,
                                                Variable *Dest,
                                                Operand *FirstVal,
                                                Operand *SecondVal) {
  // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
  // Then the instructions will handle the Val == 0 case much more simply
  // and won't require conversion from bit position to number of zeros.
  //
  // Otherwise:
  //   bsr IF_NOT_ZERO, Val
  //   mov T_DEST, ((Ty == i32) ? 63 : 127)
  //   cmovne T_DEST, IF_NOT_ZERO
  //   xor T_DEST, ((Ty == i32) ? 31 : 63)
  //   mov DEST, T_DEST
  //
  // NOTE: T_DEST must be a register because cmov requires its dest to be a
  // register. Also, bsf and bsr require their dest to be a register.
  //
  // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
  // E.g., for 000... 00001100, bsr will say that the most significant bit
  // set is at position 3, while the number of leading zeros is 28. Xor is
  // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
  // all-zeros case).
  //
  // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32
  // bits are all zero, and compute the result for that case (checking the
  // lower 32 bits). Then actually compute the result for the upper bits and
  // cmov in the result from the lower computation if the earlier speculation
  // was correct.
  //
  // Cttz is similar, but uses bsf instead, doesn't require the xor bit
  // position conversion, and the speculation is reversed.
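  //
  // Worked example (i32 ctlz): for Val = 0x0000000C, bsr reports bit
  // position 3, and 3 ^ 31 = 28, the number of leading zeros; for Val = 0
  // the cmovne is not taken, T_DEST stays 63, and 63 ^ 31 = 32.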

  // TODO(jpp): refactor this method.
  assert(Ty == IceType_i32 || Ty == IceType_i64);
  const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
  Variable *T = makeReg(DestTy);
  Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
  if (Cttz) {
    _bsf(T, FirstValRM);
  } else {
    _bsr(T, FirstValRM);
  }
  Variable *T_Dest = makeReg(DestTy);
  Constant *_31 = Ctx->getConstantInt32(31);
  Constant *_32 = Ctx->getConstantInt(DestTy, 32);
  Constant *_63 = Ctx->getConstantInt(DestTy, 63);
  Constant *_64 = Ctx->getConstantInt(DestTy, 64);
  if (Cttz) {
    if (DestTy == IceType_i64) {
      _mov(T_Dest, _64);
    } else {
      _mov(T_Dest, _32);
    }
  } else {
    Constant *_127 = Ctx->getConstantInt(DestTy, 127);
    if (DestTy == IceType_i64) {
      _mov(T_Dest, _127);
    } else {
      _mov(T_Dest, _63);
    }
  }
  _cmov(T_Dest, T, Traits::Cond::Br_ne);
  if (!Cttz) {
    if (DestTy == IceType_i64) {
      // Even though there's a _63 available at this point, that constant might
      // not be an i32, which will cause the xor emission to fail.
      Constant *_63 = Ctx->getConstantInt32(63);
      _xor(T_Dest, _63);
    } else {
      _xor(T_Dest, _31);
    }
  }
  if (Traits::Is64Bit || Ty == IceType_i32) {
    _mov(Dest, T_Dest);
    return;
  }
  _add(T_Dest, _32);
  auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
  auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
  // Will be using "test" on this, so we need a registerized variable.
  Variable *SecondVar = legalizeToReg(SecondVal);
  Variable *T_Dest2 = makeReg(IceType_i32);
  if (Cttz) {
    _bsf(T_Dest2, SecondVar);
  } else {
    _bsr(T_Dest2, SecondVar);
    _xor(T_Dest2, _31);
  }
  _test(SecondVar, SecondVar);
  _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
  _mov(DestLo, T_Dest2);
  _mov(DestHi, Ctx->getConstantZero(IceType_i32));
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest,
                                          Variable *Base, Constant *Offset) {
  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
  // legalize Mem properly.
  if (Offset)
    assert(!llvm::isa<ConstantRelocatable>(Offset));

  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);

  if (isVectorType(Ty))
    _movp(Dest, Mem);
  else if (Ty == IceType_f64)
    _movq(Dest, Mem);
  else
    _mov(Dest, Mem);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value,
                                           Variable *Base, Constant *Offset) {
  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
  // legalize Mem properly.
  if (Offset)
    assert(!llvm::isa<ConstantRelocatable>(Offset));

  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);

  if (isVectorType(Ty))
    _storep(Value, Mem);
  else if (Ty == IceType_f64)
    _storeq(Value, Mem);
  else
    _store(Value, Mem);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::copyMemory(Type Ty, Variable *Dest,
                                           Variable *Src, int32_t OffsetAmt) {
  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
  // TODO(ascull): this or add nullptr test to _movp, _movq
  Variable *Data = makeReg(Ty);

  typedLoad(Ty, Data, Src, Offset);
  typedStore(Ty, Data, Dest, Offset);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerMemcpy(Operand *Dest, Operand *Src,
                                            Operand *Count) {
  // There is a load and store for each chunk in the unroll
  constexpr uint32_t BytesPerStorep = 16;

  // Check if the operands are constants
  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
  const bool IsCountConst = CountConst != nullptr;
  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;

  if (shouldOptimizeMemIntrins() && IsCountConst &&
      CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
    // Unlikely, but nothing to do if it does happen
    if (CountValue == 0)
      return;

    Variable *SrcBase = legalizeToReg(Src);
    Variable *DestBase = legalizeToReg(Dest);

    // Find the largest type that can be used and use it as much as possible in
    // reverse order. Then handle any remainder with overlapping copies. Since
    // the remainder will be at the end, there will be reduced pressure on the
    // memory unit as the accesses to the same memory are far apart.
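    // (Illustrative, assuming a 16-byte vector type is selected: a 29-byte
    // copy does one 16-byte copy at offset 0, then covers the remaining 13
    // bytes with a single overlapping 16-byte copy at offset 29 - 16 = 13.)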
    Type Ty = largestTypeInSize(CountValue);
    uint32_t TyWidth = typeWidthInBytes(Ty);

    uint32_t RemainingBytes = CountValue;
    int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
    while (RemainingBytes >= TyWidth) {
      copyMemory(Ty, DestBase, SrcBase, Offset);
      RemainingBytes -= TyWidth;
      Offset -= TyWidth;
    }

    if (RemainingBytes == 0)
      return;

    // Lower the remaining bytes. Adjust to larger types in order to make use
    // of overlaps in the copies.
    Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
    Offset = CountValue - typeWidthInBytes(LeftOverTy);
    copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
    return;
  }

  // Fall back on a function call
  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
  Call->addArg(Dest);
  Call->addArg(Src);
  Call->addArg(Count);
  lowerCall(Call);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerMemmove(Operand *Dest, Operand *Src,
                                             Operand *Count) {
  // There is a load and store for each chunk in the unroll
  constexpr uint32_t BytesPerStorep = 16;

  // Check if the operands are constants
  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
  const bool IsCountConst = CountConst != nullptr;
  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;

  if (shouldOptimizeMemIntrins() && IsCountConst &&
      CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
    // Unlikely, but nothing to do if it does happen
    if (CountValue == 0)
      return;

    Variable *SrcBase = legalizeToReg(Src);
    Variable *DestBase = legalizeToReg(Dest);

    std::tuple<Type, Constant *, Variable *>
        Moves[Traits::MEMMOVE_UNROLL_LIMIT];
    Constant *Offset;
    Variable *Reg;

    // Copy the data into registers as the source and destination could overlap
    // so make sure not to clobber the memory. This also means overlapping
    // moves can be used as we are taking a safe snapshot of the memory.
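    // (Illustrative: for memmove(p + 1, p, 16), every chunk of the source is
    // loaded into a register before any store happens, so the overlapping
    // destination bytes cannot clobber source data that is still to be read.)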
    Type Ty = largestTypeInSize(CountValue);
    uint32_t TyWidth = typeWidthInBytes(Ty);

    uint32_t RemainingBytes = CountValue;
    int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
    size_t N = 0;
    while (RemainingBytes >= TyWidth) {
      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
      Offset = Ctx->getConstantInt32(OffsetAmt);
      Reg = makeReg(Ty);
      typedLoad(Ty, Reg, SrcBase, Offset);
      RemainingBytes -= TyWidth;
      OffsetAmt -= TyWidth;
      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
    }

    if (RemainingBytes != 0) {
      // Lower the remaining bytes. Adjust to larger types in order to make use
      // of overlaps in the copies.
      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
      Ty = firstTypeThatFitsSize(RemainingBytes);
      Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
      Reg = makeReg(Ty);
      typedLoad(Ty, Reg, SrcBase, Offset);
      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
    }

    // Copy the data out into the destination memory
    for (size_t i = 0; i < N; ++i) {
      std::tie(Ty, Offset, Reg) = Moves[i];
      typedStore(Ty, Reg, DestBase, Offset);
    }

    return;
  }

  // Fall back on a function call
  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
  Call->addArg(Dest);
  Call->addArg(Src);
  Call->addArg(Count);
  lowerCall(Call);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerMemset(Operand *Dest, Operand *Val,
                                            Operand *Count) {
  constexpr uint32_t BytesPerStorep = 16;
  constexpr uint32_t BytesPerStoreq = 8;
  constexpr uint32_t BytesPerStorei32 = 4;
  assert(Val->getType() == IceType_i8);

  // Check if the operands are constants
  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
  const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
  const bool IsCountConst = CountConst != nullptr;
  const bool IsValConst = ValConst != nullptr;
  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
  const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;

  // Unlikely, but nothing to do if it does happen
  if (IsCountConst && CountValue == 0)
    return;

  // TODO(ascull): if the count is constant but val is not it would be possible
  // to inline by spreading the value across 4 bytes and accessing subregs e.g.
  // eax, ax and al.
  if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
    Variable *Base = nullptr;
    Variable *VecReg = nullptr;
    const uint32_t MaskValue = (ValValue & 0xff);
    const uint32_t SpreadValue =
        (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;

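    // (E.g., for Val = 0xAB, SpreadValue is 0xABABABAB, so one 32-bit store
    // writes four bytes of the memset pattern at once.)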
    auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
                                                        uint32_t OffsetAmt) {
      assert(Base != nullptr);
      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;

      // TODO(ascull): is 64-bit better with vector or scalar movq?
      auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
      if (isVectorType(Ty)) {
        assert(VecReg != nullptr);
        _storep(VecReg, Mem);
      } else if (Ty == IceType_f64) {
        assert(VecReg != nullptr);
        _storeq(VecReg, Mem);
      } else {
        assert(Ty != IceType_i64);
        _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
      }
    };

    // Find the largest type that can be used and use it as much as possible in
    // reverse order. Then handle any remainder with overlapping copies. Since
    // the remainder will be at the end, there will be reduced pressure on the
    // memory unit as the accesses to the same memory are far apart.
    Type Ty;
    if (ValValue == 0 && CountValue >= BytesPerStoreq &&
        CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
      // When the value is zero it can be loaded into a vector register cheaply
      // using the xor trick.
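      // (Presumably via "pxor xmm, xmm", which zeroes the register without
      // touching memory.)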
      Base = legalizeToReg(Dest);
      VecReg = makeVectorOfZeros(IceType_v16i8);
      Ty = largestTypeInSize(CountValue);
    } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
      // When the value is non-zero or the count is small we can't use vector
      // instructions so are limited to 32-bit stores.
      Base = legalizeToReg(Dest);
      constexpr uint32_t MaxSize = 4;
      Ty = largestTypeInSize(CountValue, MaxSize);
    }

    if (Base) {
      uint32_t TyWidth = typeWidthInBytes(Ty);

      uint32_t RemainingBytes = CountValue;
      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
      while (RemainingBytes >= TyWidth) {
        lowerSet(Ty, Offset);
        RemainingBytes -= TyWidth;
        Offset -= TyWidth;
      }

      if (RemainingBytes == 0)
        return;

      // Lower the remaining bytes. Adjust to larger types in order to make use
      // of overlaps in the copies.
      Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
      Offset = CountValue - typeWidthInBytes(LeftOverTy);
      lowerSet(LeftOverTy, Offset);
      return;
    }
  }

  // Fall back on calling the memset function. The value operand needs to be
  // extended to a stack slot size because the PNaCl ABI requires arguments to
  // be at least 32 bits wide.
  Operand *ValExt;
  if (IsValConst) {
    ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
  } else {
    Variable *ValExtVar = Func->makeVariable(stackSlotType());
    lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
    ValExt = ValExtVar;
  }
  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
  Call->addArg(Dest);
  Call->addArg(ValExt);
  Call->addArg(Count);
  lowerCall(Call);
}

class AddressOptimizer {
  AddressOptimizer() = delete;
  AddressOptimizer(const AddressOptimizer &) = delete;
  AddressOptimizer &operator=(const AddressOptimizer &) = delete;

public:
  explicit AddressOptimizer(const Cfg *Func)
      : Func(Func), VMetadata(Func->getVMetadata()) {}

  inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
                             int32_t Offset, const Variable *Base,
                             const Variable *Index, uint16_t Shift,
                             const Inst *Reason) const;

  inline const Inst *matchAssign(Variable **Var,
                                 ConstantRelocatable **Relocatable,
                                 int32_t *Offset);

  inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
                                            uint16_t *Shift);

  inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);

  inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
                                            const uint16_t Shift,
                                            ConstantRelocatable **Relocatable,
                                            int32_t *Offset);

private:
  const Cfg *const Func;
  const VariablesMetadata *const VMetadata;

  static bool isAdd(const Inst *Instr) {
    if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
      return (Arith->getOp() == InstArithmetic::Add);
    }
    return false;
  }
};

void AddressOptimizer::dumpAddressOpt(
    const ConstantRelocatable *const Relocatable, int32_t Offset,
    const Variable *Base, const Variable *Index, uint16_t Shift,
    const Inst *Reason) const {
  if (!BuildDefs::dump())
    return;
  if (!Func->isVerbose(IceV_AddrOpt))
    return;
  OstreamLocker L(Func->getContext());
  Ostream &Str = Func->getContext()->getStrDump();
  Str << "Instruction: ";
  Reason->dumpDecorated(Func);
  Str << "  results in Base=";
  if (Base)
    Base->dump(Func);
  else
    Str << "<null>";
  Str << ", Index=";
  if (Index)
    Index->dump(Func);
  else
    Str << "<null>";
  Str << ", Shift=" << Shift << ", Offset=" << Offset
      << ", Relocatable=" << Relocatable << "\n";
}

const Inst *AddressOptimizer::matchAssign(Variable **Var,
                                          ConstantRelocatable **Relocatable,
                                          int32_t *Offset) {
  // Var originates from Var=SrcVar ==> set Var:=SrcVar
  if (*Var == nullptr)
    return nullptr;
  if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
    assert(!VMetadata->isMultiDef(*Var));
    if (llvm::isa<InstAssign>(VarAssign)) {
      Operand *SrcOp = VarAssign->getSrc(0);
      assert(SrcOp);
      if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
        if (!VMetadata->isMultiDef(SrcVar) &&
            // TODO: ensure SrcVar stays single-BB
            true) {
          *Var = SrcVar;
          return VarAssign;
        }
      } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
        int32_t MoreOffset = Const->getValue();
        if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
          return nullptr;
        *Var = nullptr;
        *Offset += MoreOffset;
        return VarAssign;
      } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
        if (*Relocatable == nullptr) {
          // It is always safe to fold a relocatable through assignment -- the
          // assignment frees a slot in the address operand that can be used to
          // hold the Sandbox Pointer -- if any.
          *Var = nullptr;
          *Relocatable = AddReloc;
          return VarAssign;
        }
      }
    }
  }
  return nullptr;
}

const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
                                                     Variable **Index,
                                                     uint16_t *Shift) {
  // Index==nullptr && Base is Base=Var1+Var2 ==>
  //   set Base=Var1, Index=Var2, Shift=0
  if (*Base == nullptr)
    return nullptr;
  if (*Index != nullptr)
    return nullptr;
  auto *BaseInst = VMetadata->getSingleDefinition(*Base);
  if (BaseInst == nullptr)
    return nullptr;
  assert(!VMetadata->isMultiDef(*Base));
  if (BaseInst->getSrcSize() < 2)
    return nullptr;
  if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
    if (VMetadata->isMultiDef(Var1))
      return nullptr;
    if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
      if (VMetadata->isMultiDef(Var2))
        return nullptr;
      if (isAdd(BaseInst) &&
          // TODO: ensure Var1 and Var2 stay single-BB
          true) {
        *Base = Var1;
        *Index = Var2;
        *Shift = 0; // should already have been 0
        return BaseInst;
      }
    }
  }
  return nullptr;
}

const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
                                                uint16_t *Shift) {
  // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
  //   Index=Var, Shift+=log2(Const)
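  // (Illustrative: starting from Index = Var * 4 with Shift = 0, the match
  // yields Index = Var, Shift = 2, which fits the x86 scaled-index form
  // [Base + Index * (1 << Shift) + Offset].)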
  if (*Index == nullptr)
    return nullptr;
  auto *IndexInst = VMetadata->getSingleDefinition(*Index);
  if (IndexInst == nullptr)
    return nullptr;
  assert(!VMetadata->isMultiDef(*Index));

  // When using an unsigned 32-bit array index on x64, it gets zero-extended
  // before the shift & add. The explicit zero extension can be eliminated
  // because x86 32-bit operations automatically get zero-extended into the
  // corresponding 64-bit register.
  if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
    if (CastInst->getCastKind() == InstCast::Zext) {
      if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
        if (Var->getType() == IceType_i32 &&
            CastInst->getDest()->getType() == IceType_i64) {
          IndexInst = VMetadata->getSingleDefinition(Var);
        }
      }
    }
  }

  if (IndexInst->getSrcSize() < 2)
    return nullptr;
  if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
    if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
      if (auto *Const =
              llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
        if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
          return nullptr;
        switch (ArithInst->getOp()) {
        default:
          return nullptr;
        case InstArithmetic::Mul: {
          uint32_t Mult = Const->getValue();
          uint32_t LogMult;
          switch (Mult) {
          case 1:
            LogMult = 0;
            break;
          case 2:
            LogMult = 1;
            break;
          case 4:
            LogMult = 2;
            break;
          case 8:
            LogMult = 3;
            break;
          default:
            return nullptr;
          }
          if (*Shift + LogMult <= 3) {
            *Index = Var;
            *Shift += LogMult;
            return IndexInst;
          }
          // Avoid an implicit fallthrough into the Shl case, which would
          // misread Mult as a shift amount.
          break;
        }
        case InstArithmetic::Shl: {
          uint32_t ShiftAmount = Const->getValue();
          switch (ShiftAmount) {
          case 0:
          case 1:
          case 2:
          case 3:
            break;
          default:
            return nullptr;
          }
          if (*Shift + ShiftAmount <= 3) {
            *Index = Var;
            *Shift += ShiftAmount;
            return IndexInst;
          }
        }
        }
      }
    }
  }
  return nullptr;
}

const Inst *AddressOptimizer::matchOffsetIndexOrBase(
    Variable **IndexOrBase, const uint16_t Shift,
    ConstantRelocatable **Relocatable, int32_t *Offset) {
  // Base is Base=Var+Const || Base is Base=Const+Var ==>
  //   set Base=Var, Offset+=Const
  // Base is Base=Var-Const ==>
  //   set Base=Var, Offset-=Const
  // Index is Index=Var+Const ==>
  //   set Index=Var, Offset+=(Const<<Shift)
  // Index is Index=Const+Var ==>
  //   set Index=Var, Offset+=(Const<<Shift)
  // Index is Index=Var-Const ==>
  //   set Index=Var, Offset-=(Const<<Shift)
  // Treat Index=Var Or Const as Index=Var + Const
  //    when Var = Var' << N and log2(Const) <= N
  // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
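  // (Illustrative example of the Or case: if Var = X << 4 and Const = 7, the
  // low four bits of Var are known zero, so Var | 7 == Var + 7.)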
5602 
5603   if (*IndexOrBase == nullptr) {
5604     return nullptr;
5605   }
5606   const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
5607   if (Definition == nullptr) {
5608     return nullptr;
5609   }
5610   assert(!VMetadata->isMultiDef(*IndexOrBase));
5611   if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
5612     switch (ArithInst->getOp()) {
5613     case InstArithmetic::Add:
5614     case InstArithmetic::Sub:
5615     case InstArithmetic::Or:
5616       break;
5617     default:
5618       return nullptr;
5619     }
5620 
5621     Operand *Src0 = ArithInst->getSrc(0);
5622     Operand *Src1 = ArithInst->getSrc(1);
5623     auto *Var0 = llvm::dyn_cast<Variable>(Src0);
5624     auto *Var1 = llvm::dyn_cast<Variable>(Src1);
5625     auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
5626     auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
5627     auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
5628     auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
5629 
5630     bool IsAdd = false;
5631     if (ArithInst->getOp() == InstArithmetic::Or) {
5632       Variable *Var = nullptr;
5633       ConstantInteger32 *Const = nullptr;
5634       if (Var0 && Const1) {
5635         Var = Var0;
5636         Const = Const1;
5637       } else if (Const0 && Var1) {
5638         Var = Var1;
5639         Const = Const0;
5640       } else {
5641         return nullptr;
5642       }
      // getSingleDefinition() may return nullptr (e.g., for a multi-def
      // variable), so use dyn_cast_or_null to tolerate that.
      auto *VarDef = llvm::dyn_cast_or_null<InstArithmetic>(
          VMetadata->getSingleDefinition(Var));
      if (VarDef == nullptr)
        return nullptr;
5647 
5648       SizeT ZeroesAvailable = 0;
5649       if (VarDef->getOp() == InstArithmetic::Shl) {
5650         if (auto *ConstInt =
5651                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5652           ZeroesAvailable = ConstInt->getValue();
5653         }
5654       } else if (VarDef->getOp() == InstArithmetic::Mul) {
5655         SizeT PowerOfTwo = 0;
5656         if (auto *MultConst =
5657                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
5658           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5659             PowerOfTwo += MultConst->getValue();
5660           }
5661         }
5662         if (auto *MultConst =
5663                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5664           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5665             PowerOfTwo += MultConst->getValue();
5666           }
5667         }
5668         ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
5669       }
5670       SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
5671       if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
5672         return nullptr;
5673       IsAdd = true; // treat it as an add if the above conditions hold
5674     } else {
5675       IsAdd = ArithInst->getOp() == InstArithmetic::Add;
5676     }
5677 
5678     Variable *NewIndexOrBase = nullptr;
5679     int32_t NewOffset = 0;
5680     ConstantRelocatable *NewRelocatable = *Relocatable;
5681     if (Var0 && Var1)
5682       // TODO(sehr): merge base/index splitting into here.
5683       return nullptr;
5684     if (!IsAdd && Var1)
5685       return nullptr;
5686     if (Var0)
5687       NewIndexOrBase = Var0;
5688     else if (Var1)
5689       NewIndexOrBase = Var1;
5690     // Don't know how to add/subtract two relocatables.
5691     if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
5692       return nullptr;
5693     // Don't know how to subtract a relocatable.
5694     if (!IsAdd && Reloc1)
5695       return nullptr;
5696     // Incorporate ConstantRelocatables.
5697     if (Reloc0)
5698       NewRelocatable = Reloc0;
5699     else if (Reloc1)
5700       NewRelocatable = Reloc1;
5701     // Compute the updated constant offset.
5702     if (Const0) {
5703       const int32_t MoreOffset =
5704           IsAdd ? Const0->getValue() : -Const0->getValue();
5705       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5706         return nullptr;
5707       NewOffset += MoreOffset;
5708     }
5709     if (Const1) {
5710       const int32_t MoreOffset =
5711           IsAdd ? Const1->getValue() : -Const1->getValue();
5712       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5713         return nullptr;
5714       NewOffset += MoreOffset;
5715     }
5716     if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
5717       return nullptr;
5718     *IndexOrBase = NewIndexOrBase;
    // Shift is always zero when this is called for the base register, so the
    // scaling below only applies to index matches.
    *Offset += (NewOffset << Shift);
    *Relocatable = NewRelocatable;
5722     return Definition;
5723   }
5724   return nullptr;
5725 }
5726 
5727 template <typename TypeTraits>
5728 typename TargetX86Base<TypeTraits>::X86OperandMem *
5729 TargetX86Base<TypeTraits>::computeAddressOpt(const Inst *Instr, Type MemType,
5730                                              Operand *Addr) {
5731   Func->resetCurrentNode();
5732   if (Func->isVerbose(IceV_AddrOpt)) {
5733     OstreamLocker L(Func->getContext());
5734     Ostream &Str = Func->getContext()->getStrDump();
5735     Str << "\nStarting computeAddressOpt for instruction:\n  ";
5736     Instr->dumpDecorated(Func);
5737   }
5738 
5739   OptAddr NewAddr;
5740   NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
5741   if (NewAddr.Base == nullptr)
5742     return nullptr;
5743 
5744   // If the Base has more than one use or is live across multiple blocks, then
5745   // don't go further. Alternatively (?), never consider a transformation that
5746   // would change a variable that is currently *not* live across basic block
5747   // boundaries into one that *is*.
5748   if (!getFlags().getLoopInvariantCodeMotion()) {
    // Multi-block address optimization is needed when LICM is enabled; it
    // might make sense to restrict it to the current node and the loop
    // header.
5751     if (Func->getVMetadata()->isMultiBlock(
5752             NewAddr.Base) /* || Base->getUseCount() > 1*/)
5753       return nullptr;
5754   }
5755   AddressOptimizer AddrOpt(Func);
5756   const bool MockBounds = getFlags().getMockBoundsCheck();
5757   const Inst *Reason = nullptr;
5758   bool AddressWasOptimized = false;
  // The following unnamed struct identifies the address mode formation steps
  // that could potentially create an invalid memory operand (i.e., no free
  // slot for RebasePtr).  We gather these flags in one struct so that
  // memset() can reset them all to false at once.
5763   struct {
5764     bool AssignBase = false;
5765     bool AssignIndex = false;
5766     bool OffsetFromBase = false;
5767     bool OffsetFromIndex = false;
5768     bool CombinedBaseIndex = false;
5769   } Skip;
5770   // This points to the boolean in Skip that represents the last folding
5771   // performed. This is used to disable a pattern match that generated an
5772   // invalid address. Without this, the algorithm would never finish.
5773   bool *SkipLastFolding = nullptr;
5774   // NewAddrCheckpoint is used to rollback the address being formed in case an
5775   // invalid address is formed.
5776   OptAddr NewAddrCheckpoint;
5777   Reason = Instr;
5778   do {
5779     if (SandboxingType != ST_None) {
5780       // When sandboxing, we defer the sandboxing of NewAddr to the Concrete
5781       // Target. If our optimization was overly aggressive, then we simply undo
5782       // what the previous iteration did, and set the previous pattern's skip
5783       // bit to true.
      if (!legalizeOptAddrForSandbox(&NewAddr)) {
        // SkipLastFolding is null on the first iteration, before any folding
        // has been attempted, so guard the dereference.
        if (SkipLastFolding != nullptr)
          *SkipLastFolding = true;
        SkipLastFolding = nullptr;
        NewAddr = NewAddrCheckpoint;
        Reason = nullptr;
      }
5790     }
5791 
5792     if (Reason) {
5793       AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
5794                              NewAddr.Index, NewAddr.Shift, Reason);
5795       AddressWasOptimized = true;
5796       Reason = nullptr;
5797       SkipLastFolding = nullptr;
5798       memset(&Skip, 0, sizeof(Skip));
5799     }
5800 
5801     NewAddrCheckpoint = NewAddr;
5802 
5803     // Update Base and Index to follow through assignments to definitions.
5804     if (!Skip.AssignBase &&
5805         (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
5806                                       &NewAddr.Offset))) {
5807       SkipLastFolding = &Skip.AssignBase;
5808       // Assignments of Base from a Relocatable or ConstantInt32 can result
5809       // in Base becoming nullptr.  To avoid code duplication in this loop we
5810       // prefer that Base be non-nullptr if possible.
5811       if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
5812           NewAddr.Shift == 0) {
5813         std::swap(NewAddr.Base, NewAddr.Index);
5814       }
5815       continue;
5816     }
    if (!Skip.AssignIndex &&
5818         (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
5819                                       &NewAddr.Offset))) {
5820       SkipLastFolding = &Skip.AssignIndex;
5821       continue;
5822     }
5823 
5824     if (!MockBounds) {
5825       // Transition from:
5826       //   <Relocatable + Offset>(Base) to
5827       //   <Relocatable + Offset>(Base, Index)
5828       if (!Skip.CombinedBaseIndex &&
5829           (Reason = AddrOpt.matchCombinedBaseIndex(
5830                &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
5831         SkipLastFolding = &Skip.CombinedBaseIndex;
5832         continue;
5833       }
5834 
5835       // Recognize multiply/shift and update Shift amount.
5836       // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
5837       //   Index=Var, Shift+=Const
5838       // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
5839       //   Index=Var, Shift+=log2(Const)
5840       if ((Reason =
5841                AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
5842         continue;
5843       }
5844 
5845       // If Shift is zero, the choice of Base and Index was purely arbitrary.
5846       // Recognize multiply/shift and set Shift amount.
5847       // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
5848       //   swap(Index,Base)
5849       // Similar for Base=Const*Var and Base=Var<<Const
5850       if (NewAddr.Shift == 0 &&
5851           (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
5852         std::swap(NewAddr.Base, NewAddr.Index);
5853         continue;
5854       }
5855     }
5856 
5857     // Update Offset to reflect additions/subtractions with constants and
5858     // relocatables.
5859     // TODO: consider overflow issues with respect to Offset.
5860     if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
5861                                      &NewAddr.Base, /*Shift =*/0,
5862                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
5863       SkipLastFolding = &Skip.OffsetFromBase;
5864       continue;
5865     }
5866     if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
5867                                       &NewAddr.Index, NewAddr.Shift,
5868                                       &NewAddr.Relocatable, &NewAddr.Offset))) {
5869       SkipLastFolding = &Skip.OffsetFromIndex;
5870       continue;
5871     }
5872 
5873     break;
5874   } while (Reason);
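  // End-to-end example (illustrative): for the chain
  //   t1 = a + 12 ; t2 = t1 * 4 ; t3 = t2 + b ; ... = load t3
  // the iterations above can fold everything into the single operand
  //   [b + 4*a + 48]
  // via matchCombinedBaseIndex, matchShiftedIndex, and
  // matchOffsetIndexOrBase, assuming each temporary is single-block and
  // singly defined.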
5875 
5876   if (!AddressWasOptimized) {
5877     return nullptr;
5878   }
5879 
5880   // Undo any addition of RebasePtr.  It will be added back when the mem
5881   // operand is sandboxed.
5882   if (NewAddr.Base == RebasePtr) {
5883     NewAddr.Base = nullptr;
5884   }
5885 
5886   if (NewAddr.Index == RebasePtr) {
5887     NewAddr.Index = nullptr;
5888     NewAddr.Shift = 0;
5889   }
5890 
5891   Constant *OffsetOp = nullptr;
5892   if (NewAddr.Relocatable == nullptr) {
5893     OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
5894   } else {
5895     OffsetOp =
5896         Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
5897                             NewAddr.Relocatable->getName());
5898   }
5899   // Vanilla ICE load instructions should not use the segment registers, and
5900   // computeAddressOpt only works at the level of Variables and Constants, not
5901   // other X86OperandMem, so there should be no mention of segment
5902   // registers there either.
5903   static constexpr auto SegmentReg =
5904       X86OperandMem::SegmentRegisters::DefaultSegment;
5905 
5906   return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
5907                                NewAddr.Index, NewAddr.Shift, SegmentReg);
5908 }
5909 
5910 /// Add a mock bounds check on the memory address before using it as a load or
5911 /// store operand.  The basic idea is that given a memory operand [reg], we
5912 /// would first add bounds-check code something like:
5913 ///
5914 ///   cmp reg, <lb>
5915 ///   jl out_of_line_error
5916 ///   cmp reg, <ub>
5917 ///   jg out_of_line_error
5918 ///
5919 /// In reality, the specific code will depend on how <lb> and <ub> are
5920 /// represented, e.g. an immediate, a global, or a function argument.
5921 ///
5922 /// As such, we need to enforce that the memory operand does not have the form
5923 /// [reg1+reg2], because then there is no simple cmp instruction that would
5924 /// suffice.  However, we consider [reg+offset] to be OK because the offset is
5925 /// usually small, and so <ub> could have a safety buffer built in and then we
5926 /// could instead branch to a custom out_of_line_error that does the precise
5927 /// check and jumps back if it turns out OK.
5928 ///
5929 /// For the purpose of mocking the bounds check, we'll do something like this:
5930 ///
5931 ///   cmp reg, 0
5932 ///   je label
5933 ///   cmp reg, 1
5934 ///   je label
5935 ///   label:
5936 ///
5937 /// Also note that we don't need to add a bounds check to a dereference of a
5938 /// simple global variable address.
5939 template <typename TraitsType>
5940 void TargetX86Base<TraitsType>::doMockBoundsCheck(Operand *Opnd) {
5941   if (!getFlags().getMockBoundsCheck())
5942     return;
5943   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
5944     if (Mem->getIndex()) {
5945       llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
5946     }
5947     Opnd = Mem->getBase();
5948   }
5949   // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
5950   // something else.  We only care if it is Variable.
5951   auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
5952   if (Var == nullptr)
5953     return;
5954   // We use lowerStore() to copy out-args onto the stack.  This creates a memory
5955   // operand with the stack pointer as the base register.  Don't do bounds
5956   // checks on that.
5957   if (Var->getRegNum() == getStackReg())
5958     return;
5959 
5960   auto *Label = InstX86Label::create(Func, this);
5961   _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
5962   _br(Traits::Cond::Br_e, Label);
5963   _cmp(Opnd, Ctx->getConstantInt32(1));
5964   _br(Traits::Cond::Br_e, Label);
5965   Context.insert(Label);
5966 }
5967 
5968 template <typename TraitsType>
5969 void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) {
5970   // A Load instruction can be treated the same as an Assign instruction, after
5971   // the source operand is transformed into an X86OperandMem operand.  Note that
5972   // the address mode optimization already creates an X86OperandMem operand, so
5973   // it doesn't need another level of transformation.
5974   Variable *DestLoad = Load->getDest();
5975   Type Ty = DestLoad->getType();
5976   Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
5977   doMockBoundsCheck(Src0);
5978   auto *Assign = InstAssign::create(Func, DestLoad, Src0);
5979   lowerAssign(Assign);
5980 }
5981 
5982 template <typename TraitsType>
5983 void TargetX86Base<TraitsType>::doAddressOptOther() {
  // Inverts some Icmp instructions, which helps doAddressOptLoad later.
5985   // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
5986   Inst *Instr = iteratorToInst(Context.getCur());
5987   auto *VMetadata = Func->getVMetadata();
5988   if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
5989     if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
5990         llvm::isa<Constant>(Icmp->getSrc(1)))
5991       return;
5992     auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
5993     if (Var0 == nullptr)
5994       return;
5995     if (!VMetadata->isTracked(Var0))
5996       return;
5997     auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
5998     if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
5999       return;
6000     if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
6001       return;
6002 
6003     auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
6004     if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
6005       auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
6006       if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
6007           llvm::isa<InstLoad>(Op1Def)) {
6008         return; // Both are loads
6009       }
6010     }
6011     Icmp->reverseConditionAndOperands();
6012   }
6013 }
6014 
6015 template <typename TraitsType>
6016 void TargetX86Base<TraitsType>::doAddressOptLoad() {
6017   Inst *Instr = iteratorToInst(Context.getCur());
6018   Operand *Addr = Instr->getSrc(0);
6019   Variable *Dest = Instr->getDest();
6020   if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
6021     Instr->setDeleted();
6022     Context.insert<InstLoad>(Dest, OptAddr);
6023   }
6024 }
6025 
6026 template <typename TraitsType>
6027 void TargetX86Base<TraitsType>::doAddressOptLoadSubVector() {
6028   auto *Intrinsic = llvm::cast<InstIntrinsicCall>(Context.getCur());
6029   Operand *Addr = Intrinsic->getArg(0);
6030   Variable *Dest = Intrinsic->getDest();
6031   if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
6032     Intrinsic->setDeleted();
6033     const Ice::Intrinsics::IntrinsicInfo Info = {
6034         Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
6035         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
6036     auto Target = Ctx->getConstantUndef(Ice::IceType_i32);
6037     auto *NewLoad = Context.insert<InstIntrinsicCall>(2, Dest, Target, Info);
6038     NewLoad->addArg(OptAddr);
6039     NewLoad->addArg(Intrinsic->getArg(1));
6040   }
6041 }
6042 
6043 template <typename TraitsType>
6044 void TargetX86Base<TraitsType>::randomlyInsertNop(float Probability,
6045                                                   RandomNumberGenerator &RNG) {
6046   RandomNumberGeneratorWrapper RNGW(RNG);
6047   if (RNGW.getTrueWithProbability(Probability)) {
6048     _nop(RNGW(Traits::X86_NUM_NOP_VARIANTS));
6049   }
6050 }
6051 
6052 template <typename TraitsType>
6053 void TargetX86Base<TraitsType>::lowerPhi(const InstPhi * /*Instr*/) {
6054   Func->setError("Phi found in regular instruction list");
6055 }
6056 
6057 template <typename TraitsType>
6058 void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) {
6059   Variable *Reg = nullptr;
6060   if (Instr->hasRetValue()) {
6061     Operand *RetValue = legalize(Instr->getRetValue());
6062     const Type ReturnType = RetValue->getType();
6063     assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
6064            (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
6065     Reg = moveReturnValueToRegister(RetValue, ReturnType);
6066   }
6067   // Add a ret instruction even if sandboxing is enabled, because addEpilog
6068   // explicitly looks for a ret instruction as a marker for where to insert the
6069   // frame removal instructions.
6070   _ret(Reg);
6071   // Add a fake use of esp to make sure esp stays alive for the entire
6072   // function. Otherwise post-call esp adjustments get dead-code eliminated.
6073   keepEspLiveAtExit();
6074 }
6075 
6076 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
6077                                SizeT Index3) {
6078   const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
6079                      ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
6080   assert(Mask < 256);
6081   return Mask;
6082 }
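
// For example (illustrative), makePshufdMask(3, 2, 1, 0) yields 0x1B
// (0b00011011), the pshufd immediate that reverses the four 32-bit lanes.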
6083 
6084 template <typename TraitsType>
6085 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc(
6086     Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
6087   constexpr SizeT SrcBit = 1 << 2;
6088   assert((Index0 & SrcBit) == (Index1 & SrcBit));
6089   assert((Index0 & SrcBit) == (Index2 & SrcBit));
6090   assert((Index0 & SrcBit) == (Index3 & SrcBit));
6091   (void)SrcBit;
6092 
6093   const Type SrcTy = Src->getType();
6094   auto *T = makeReg(SrcTy);
6095   auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
6096   auto *Mask =
6097       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
6098   _pshufd(T, SrcRM, Mask);
6099   return T;
6100 }
6101 
6102 template <typename TraitsType>
6103 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc(
6104     Operand *Src0, SizeT Index0, SizeT Index1, Operand *Src1, SizeT Index2,
6105     SizeT Index3) {
6106   constexpr SizeT SrcBit = 1 << 2;
6107   assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
6108   assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
6109   (void)SrcBit;
6110 
6111   const Type SrcTy = Src0->getType();
6112   assert(Src1->getType() == SrcTy);
6113   auto *T = makeReg(SrcTy);
6114   auto *Src0R = legalizeToReg(Src0);
6115   auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6116   auto *Mask =
6117       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
6118   _movp(T, Src0R);
6119   _shufps(T, Src1RM, Mask);
6120   return T;
6121 }
6122 
6123 template <typename TraitsType>
6124 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs(
6125     Operand *Src0, SizeT Index0, Operand *Src1, SizeT Index1) {
6126   return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
6127                                            Index1, IGNORE_INDEX);
6128 }
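
// After unification (an illustrative reading of the call above),
// Src0[Index0] lands in lane 0 of the result and Src1[Index1] in lane 2,
// which is presumably what UNIFIED_INDEX_0 and UNIFIED_INDEX_1 name.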
6129 
6130 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
6131                                SizeT Index3) {
6132   constexpr SizeT SrcBit = 1 << 2;
6133   const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
6134   const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
6135   const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
6136   const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
6137   return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
6138 }
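
// For example (illustrative), indexes (0, 5, 1, 4) on a v4i32 shuffle
// (SrcBit == 4) produce mask 0b1010: elements 1 and 3 come from the second
// source, elements 0 and 2 from the first.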
6139 
6140 template <typename TraitsType>
6141 GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() {
6142   GlobalString FuncName = Func->getFunctionName();
6143   const SizeT Id = PshufbMaskCount++;
6144   if (!BuildDefs::dump() || !FuncName.hasStdString()) {
6145     return GlobalString::createWithString(
6146         Ctx,
6147         "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
6148   }
6149   return GlobalString::createWithString(
6150       Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
6151 }
6152 
6153 template <typename TraitsType>
6154 ConstantRelocatable *
6155 TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask(
6156     int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
6157     int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
6158     int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
6159     int8_t Idx15) {
6160   static constexpr uint8_t NumElements = 16;
6161   const char Initializer[NumElements] = {
6162       Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
6163       Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
6164   };
6165 
6166   static constexpr Type V4VectorType = IceType_v4i32;
6167   const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
6168   auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
6169   GlobalString MaskName = lowerShuffleVector_NewMaskName();
6170   Mask->setIsConstant(true);
6171   Mask->addInitializer(VariableDeclaration::DataInitializer::create(
6172       Func->getGlobalPool(), Initializer, NumElements));
6173   Mask->setName(MaskName);
6174   // Mask needs to be 16-byte aligned, or pshufb will seg fault.
6175   Mask->setAlignment(MaskAlignment);
6176   Func->addGlobal(Mask);
6177 
6178   constexpr RelocOffsetT Offset = 0;
6179   return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
6180 }
6181 
6182 template <typename TraitsType>
6183 void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb(
6184     Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
6185     int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
6186     int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
6187     int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
6188   const Type DestTy = Dest->getType();
6189   static constexpr bool NotRebased = false;
6190   static constexpr Variable *NoBase = nullptr;
  // We use void for the memory operand instead of DestTy because using the
  // latter causes a validation failure: the X86 Inst layer complains that
  // vector mem operands could be under-aligned.  Using void avoids that
  // validation error.  Note that the mask global declaration is aligned, so
  // it can be used as an XMM mem operand.
6196   static constexpr Type MaskType = IceType_void;
6197 #define IDX_IN_SRC(N, S)                                                       \
6198   ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
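  // IDX_IN_SRC keeps the low nibble of index N when bit 4 says the element
  // comes from source S, and otherwise emits CLEAR_ALL_BITS (presumably a
  // mask byte with its high bit set) so that pshufb zeroes the byte: pshufb
  // clears a lane whenever the mask byte's high bit is set.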
6199   auto *Mask0M = X86OperandMem::create(
6200       Func, MaskType, NoBase,
6201       lowerShuffleVector_CreatePshufbMask(
6202           IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
6203           IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
6204           IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
6205           IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
6206           IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
6207           IDX_IN_SRC(Idx15, 0)),
6208       NotRebased);
6209 
6210   auto *T0 = makeReg(DestTy);
6211   auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6212   _movp(T0, Src0RM);
6213 
6214   _pshufb(T0, Mask0M);
6215 
6216   if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
6217       Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
6218       Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
6219       Idx15 >= 16) {
6220     auto *Mask1M = X86OperandMem::create(
6221         Func, MaskType, NoBase,
6222         lowerShuffleVector_CreatePshufbMask(
6223             IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
6224             IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
6225             IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
6226             IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
6227             IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
6228             IDX_IN_SRC(Idx15, 1)),
6229         NotRebased);
6230 #undef IDX_IN_SRC
6231     auto *T1 = makeReg(DestTy);
6232     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6233     _movp(T1, Src1RM);
6234     _pshufb(T1, Mask1M);
6235     _por(T0, T1);
6236   }
6237 
6238   _movp(Dest, T0);
6239 }
6240 
6241 template <typename TraitsType>
6242 void TargetX86Base<TraitsType>::lowerShuffleVector(
6243     const InstShuffleVector *Instr) {
6244   auto *Dest = Instr->getDest();
6245   const Type DestTy = Dest->getType();
6246   auto *Src0 = Instr->getSrc(0);
6247   auto *Src1 = Instr->getSrc(1);
6248   const SizeT NumElements = typeNumElements(DestTy);
6249 
6250   auto *T = makeReg(DestTy);
6251 
6252   switch (DestTy) {
6253   default:
6254     llvm::report_fatal_error("Unexpected vector type.");
6255   case IceType_v16i1:
6256   case IceType_v16i8: {
6257     static constexpr SizeT ExpectedNumElements = 16;
6258     assert(ExpectedNumElements == Instr->getNumIndexes());
6259     (void)ExpectedNumElements;
6260 
6261     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
6262       auto *T = makeReg(DestTy);
6263       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6264       _movp(T, Src0RM);
6265       _punpckl(T, Src0RM);
6266       _movp(Dest, T);
6267       return;
6268     }
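    // The pattern above works because punpcklbw with the operand itself
    // duplicates each of the low eight bytes, which is exactly the
    // (0, 0, 1, 1, ..., 7, 7) index sequence.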
6269 
6270     if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
6271                           23)) {
6272       auto *T = makeReg(DestTy);
6273       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6274       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6275       _movp(T, Src0RM);
6276       _punpckl(T, Src1RM);
6277       _movp(Dest, T);
6278       return;
6279     }
6280 
6281     if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
6282                           15, 15)) {
6283       auto *T = makeReg(DestTy);
6284       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6285       _movp(T, Src0RM);
6286       _punpckh(T, Src0RM);
6287       _movp(Dest, T);
6288       return;
6289     }
6290 
6291     if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
6292                           15, 31)) {
6293       auto *T = makeReg(DestTy);
6294       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6295       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6296       _movp(T, Src0RM);
6297       _punpckh(T, Src1RM);
6298       _movp(Dest, T);
6299       return;
6300     }
6301 
6302     if (InstructionSet < Traits::SSE4_1) {
6303       // TODO(jpp): figure out how to lower with sse2.
6304       break;
6305     }
6306 
6307     const SizeT Index0 = Instr->getIndexValue(0);
6308     const SizeT Index1 = Instr->getIndexValue(1);
6309     const SizeT Index2 = Instr->getIndexValue(2);
6310     const SizeT Index3 = Instr->getIndexValue(3);
6311     const SizeT Index4 = Instr->getIndexValue(4);
6312     const SizeT Index5 = Instr->getIndexValue(5);
6313     const SizeT Index6 = Instr->getIndexValue(6);
6314     const SizeT Index7 = Instr->getIndexValue(7);
6315     const SizeT Index8 = Instr->getIndexValue(8);
6316     const SizeT Index9 = Instr->getIndexValue(9);
6317     const SizeT Index10 = Instr->getIndexValue(10);
6318     const SizeT Index11 = Instr->getIndexValue(11);
6319     const SizeT Index12 = Instr->getIndexValue(12);
6320     const SizeT Index13 = Instr->getIndexValue(13);
6321     const SizeT Index14 = Instr->getIndexValue(14);
6322     const SizeT Index15 = Instr->getIndexValue(15);
6323 
6324     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
6325                                    Index3, Index4, Index5, Index6, Index7,
6326                                    Index8, Index9, Index10, Index11, Index12,
6327                                    Index13, Index14, Index15);
6328     return;
6329   }
6330   case IceType_v8i1:
6331   case IceType_v8i16: {
6332     static constexpr SizeT ExpectedNumElements = 8;
6333     assert(ExpectedNumElements == Instr->getNumIndexes());
6334     (void)ExpectedNumElements;
6335 
6336     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
6337       auto *T = makeReg(DestTy);
6338       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6339       _movp(T, Src0RM);
6340       _punpckl(T, Src0RM);
6341       _movp(Dest, T);
6342       return;
6343     }
6344 
6345     if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
6346       auto *T = makeReg(DestTy);
6347       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6348       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6349       _movp(T, Src0RM);
6350       _punpckl(T, Src1RM);
6351       _movp(Dest, T);
6352       return;
6353     }
6354 
6355     if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
6356       auto *T = makeReg(DestTy);
6357       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6358       _movp(T, Src0RM);
6359       _punpckh(T, Src0RM);
6360       _movp(Dest, T);
6361       return;
6362     }
6363 
6364     if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
6365       auto *T = makeReg(DestTy);
6366       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6367       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6368       _movp(T, Src0RM);
6369       _punpckh(T, Src1RM);
6370       _movp(Dest, T);
6371       return;
6372     }
6373 
6374     if (InstructionSet < Traits::SSE4_1) {
6375       // TODO(jpp): figure out how to lower with sse2.
6376       break;
6377     }
6378 
6379     const SizeT Index0 = Instr->getIndexValue(0);
6380     const SizeT Index1 = Instr->getIndexValue(1);
6381     const SizeT Index2 = Instr->getIndexValue(2);
6382     const SizeT Index3 = Instr->getIndexValue(3);
6383     const SizeT Index4 = Instr->getIndexValue(4);
6384     const SizeT Index5 = Instr->getIndexValue(5);
6385     const SizeT Index6 = Instr->getIndexValue(6);
6386     const SizeT Index7 = Instr->getIndexValue(7);
6387 
6388 #define TO_BYTE_INDEX(I) ((I) << 1)
6389     lowerShuffleVector_UsingPshufb(
6390         Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
6391         TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
6392         TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
6393         TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
6394         TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
6395         TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
6396         TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
6397         TO_BYTE_INDEX(Index7) + 1);
6398 #undef TO_BYTE_INDEX
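    // For example (illustrative), a v8i16 index of 5 expands above to byte
    // indexes 10 and 11, the two bytes of element 5 in little-endian order.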
6399     return;
6400   }
6401   case IceType_v4i1:
6402   case IceType_v4i32:
6403   case IceType_v4f32: {
6404     static constexpr SizeT ExpectedNumElements = 4;
6405     assert(ExpectedNumElements == Instr->getNumIndexes());
6406     const SizeT Index0 = Instr->getIndexValue(0);
6407     const SizeT Index1 = Instr->getIndexValue(1);
6408     const SizeT Index2 = Instr->getIndexValue(2);
6409     const SizeT Index3 = Instr->getIndexValue(3);
6410     Variable *T = nullptr;
6411     switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
6412 #define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
6413   case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
6414       CASE_SRCS_IN(0, 0, 0, 0) : {
6415         T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
6416                                               Index3);
6417       }
6418       break;
6419       CASE_SRCS_IN(0, 0, 0, 1) : {
6420         assert(false && "Following code is untested but likely correct; test "
6421                         "and remove assert.");
6422         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6423                                                                   Src1, Index3);
6424         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6425                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6426       }
6427       break;
6428       CASE_SRCS_IN(0, 0, 1, 0) : {
6429         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6430                                                                   Src0, Index3);
6431         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6432                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6433       }
6434       break;
6435       CASE_SRCS_IN(0, 0, 1, 1) : {
6436         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
6437                                               Index2, Index3);
6438       }
6439       break;
6440       CASE_SRCS_IN(0, 1, 0, 0) : {
6441         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6442                                                                   Src1, Index1);
6443         T = lowerShuffleVector_TwoFromSameSrc(
6444             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6445       }
6446       break;
6447       CASE_SRCS_IN(0, 1, 0, 1) : {
6448         if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
6449             (Index3 - ExpectedNumElements) == 1) {
6450           auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6451           auto *Src0R = legalizeToReg(Src0);
6452           T = makeReg(DestTy);
6453           _movp(T, Src0R);
6454           _punpckl(T, Src1RM);
6455         } else if (Index0 == Index2 && Index1 == Index3) {
6456           assert(false && "Following code is untested but likely correct; test "
6457                           "and remove assert.");
6458           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6459               Src0, Index0, Src1, Index1);
6460           T = lowerShuffleVector_AllFromSameSrc(
6461               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6462               UNIFIED_INDEX_1);
6463         } else {
6464           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6465               Src0, Index0, Src1, Index1);
6466           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6467               Src0, Index2, Src1, Index3);
6468           T = lowerShuffleVector_TwoFromSameSrc(
6469               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6470               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6471         }
6472       }
6473       break;
6474       CASE_SRCS_IN(0, 1, 1, 0) : {
6475         if (Index0 == Index3 && Index1 == Index2) {
6476           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6477               Src0, Index0, Src1, Index1);
6478           T = lowerShuffleVector_AllFromSameSrc(
6479               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6480               UNIFIED_INDEX_0);
6481         } else {
6482           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6483               Src0, Index0, Src1, Index1);
6484           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6485               Src1, Index2, Src0, Index3);
6486           T = lowerShuffleVector_TwoFromSameSrc(
6487               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6488               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6489         }
6490       }
6491       break;
6492       CASE_SRCS_IN(0, 1, 1, 1) : {
6493         assert(false && "Following code is untested but likely correct; test "
6494                         "and remove assert.");
6495         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6496                                                                   Src1, Index1);
6497         T = lowerShuffleVector_TwoFromSameSrc(
6498             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6499       }
6500       break;
6501       CASE_SRCS_IN(1, 0, 0, 0) : {
6502         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6503                                                                   Src0, Index1);
6504         T = lowerShuffleVector_TwoFromSameSrc(
6505             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6506       }
6507       break;
6508       CASE_SRCS_IN(1, 0, 0, 1) : {
6509         if (Index0 == Index3 && Index1 == Index2) {
6510           assert(false && "Following code is untested but likely correct; test "
6511                           "and remove assert.");
6512           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6513               Src1, Index0, Src0, Index1);
6514           T = lowerShuffleVector_AllFromSameSrc(
6515               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6516               UNIFIED_INDEX_0);
6517         } else {
6518           assert(false && "Following code is untested but likely correct; test "
6519                           "and remove assert.");
6520           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6521               Src1, Index0, Src0, Index1);
6522           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6523               Src0, Index2, Src1, Index3);
6524           T = lowerShuffleVector_TwoFromSameSrc(
6525               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6526               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6527         }
6528       }
6529       break;
6530       CASE_SRCS_IN(1, 0, 1, 0) : {
6531         if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
6532             (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
          // Note the deliberate swap relative to the (0, 1, 0, 1) case:
          // Src1 supplies the even destination lanes, so it is the register
          // operand and Src0 the memory operand.
          auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
          auto *Src1R = legalizeToReg(Src1);
          T = makeReg(DestTy);
          _movp(T, Src1R);
          _punpckl(T, Src0RM);
6538         } else if (Index0 == Index2 && Index1 == Index3) {
6539           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6540               Src1, Index0, Src0, Index1);
6541           T = lowerShuffleVector_AllFromSameSrc(
6542               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6543               UNIFIED_INDEX_1);
6544         } else {
6545           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6546               Src1, Index0, Src0, Index1);
6547           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6548               Src1, Index2, Src0, Index3);
6549           T = lowerShuffleVector_TwoFromSameSrc(
6550               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6551               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6552         }
6553       }
6554       break;
6555       CASE_SRCS_IN(1, 0, 1, 1) : {
6556         assert(false && "Following code is untested but likely correct; test "
6557                         "and remove assert.");
6558         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6559                                                                   Src0, Index1);
6560         T = lowerShuffleVector_TwoFromSameSrc(
6561             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6562       }
6563       break;
6564       CASE_SRCS_IN(1, 1, 0, 0) : {
6565         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
6566                                               Index2, Index3);
6567       }
6568       break;
6569       CASE_SRCS_IN(1, 1, 0, 1) : {
6570         assert(false && "Following code is untested but likely correct; test "
6571                         "and remove assert.");
6572         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6573                                                                   Src1, Index3);
6574         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6575                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6576       }
6577       break;
6578       CASE_SRCS_IN(1, 1, 1, 0) : {
6579         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6580                                                                   Src0, Index3);
6581         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6582                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6583       }
6584       break;
6585       CASE_SRCS_IN(1, 1, 1, 1) : {
6586         assert(false && "Following code is untested but likely correct; test "
6587                         "and remove assert.");
6588         T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
6589                                               Index3);
6590       }
6591       break;
6592 #undef CASE_SRCS_IN
6593     }
6594 
6595     assert(T != nullptr);
6596     assert(T->getType() == DestTy);
6597     _movp(Dest, T);
6598     return;
6599   } break;
6600   }
6601 
6602   // Unoptimized shuffle. Perform a series of inserts and extracts.
6603   Context.insert<InstFakeDef>(T);
6604   const Type ElementType = typeElementType(DestTy);
6605   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
6606     auto *Index = Instr->getIndex(I);
6607     const SizeT Elem = Index->getValue();
6608     auto *ExtElmt = makeReg(ElementType);
6609     if (Elem < NumElements) {
6610       lowerExtractElement(
6611           InstExtractElement::create(Func, ExtElmt, Src0, Index));
6612     } else {
6613       lowerExtractElement(InstExtractElement::create(
6614           Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
6615     }
6616     auto *NewT = makeReg(DestTy);
6617     lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
6618                                                  Ctx->getConstantInt32(I)));
6619     T = NewT;
6620   }
6621   _movp(Dest, T);
6622 }
6623 
6624 template <typename TraitsType>
6625 void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
6626   Variable *Dest = Select->getDest();
6627 
6628   Operand *Condition = Select->getCondition();
6629   // Handle folding opportunities.
6630   if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
6631     assert(Producer->isDeleted());
6632     switch (BoolFolding<Traits>::getProducerKind(Producer)) {
6633     default:
6634       break;
6635     case BoolFolding<Traits>::PK_Icmp32:
6636     case BoolFolding<Traits>::PK_Icmp64: {
6637       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
6638       return;
6639     }
6640     case BoolFolding<Traits>::PK_Fcmp: {
6641       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
6642       return;
6643     }
6644     }
6645   }
6646 
6647   if (isVectorType(Dest->getType())) {
6648     lowerSelectVector(Select);
6649     return;
6650   }
6651 
6652   Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
6653   Operand *Zero = Ctx->getConstantZero(IceType_i32);
6654   _cmp(CmpResult, Zero);
6655   Operand *SrcT = Select->getTrueOperand();
6656   Operand *SrcF = Select->getFalseOperand();
6657   const BrCond Cond = Traits::Cond::Br_ne;
6658   lowerSelectMove(Dest, Cond, SrcT, SrcF);
6659 }
6660 
6661 template <typename TraitsType>
6662 void TargetX86Base<TraitsType>::lowerSelectMove(Variable *Dest, BrCond Cond,
6663                                                 Operand *SrcT, Operand *SrcF) {
6664   Type DestTy = Dest->getType();
6665   if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
6666     // The cmov instruction doesn't allow 8-bit or FP operands, so we need
6667     // explicit control flow.
6668     // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
6669     auto *Label = InstX86Label::create(Func, this);
6670     SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
6671     _mov(Dest, SrcT);
6672     _br(Cond, Label);
6673     SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
6674     _redefined(_mov(Dest, SrcF));
6675     Context.insert(Label);
6676     return;
6677   }
6678   // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
6679   // But if SrcT is immediate, we might be able to do better, as the cmov
6680   // instruction doesn't allow an immediate operand:
6681   // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
6682   if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
6683     std::swap(SrcT, SrcF);
6684     Cond = InstImpl<TraitsType>::InstX86Base::getOppositeCondition(Cond);
6685   }
6686   if (!Traits::Is64Bit && DestTy == IceType_i64) {
6687     SrcT = legalizeUndef(SrcT);
6688     SrcF = legalizeUndef(SrcF);
6689     // Set the low portion.
6690     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6691     lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
6692     // Set the high portion.
6693     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6694     lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
6695     return;
6696   }
6697 
6698   assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
6699          (Traits::Is64Bit && DestTy == IceType_i64));
6700   lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
6701 }
6702 
6703 template <typename TraitsType>
6704 void TargetX86Base<TraitsType>::lowerSelectIntMove(Variable *Dest, BrCond Cond,
6705                                                    Operand *SrcT,
6706                                                    Operand *SrcF) {
6707   Variable *T = nullptr;
6708   SrcF = legalize(SrcF);
6709   _mov(T, SrcF);
6710   SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
6711   _cmov(T, SrcT, Cond);
6712   _mov(Dest, T);
6713 }
6714 
6715 template <typename TraitsType>
6716 void TargetX86Base<TraitsType>::lowerMove(Variable *Dest, Operand *Src,
6717                                           bool IsRedefinition) {
6718   assert(Dest->getType() == Src->getType());
6719   assert(!Dest->isRematerializable());
6720   if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
6721     Src = legalize(Src);
6722     Operand *SrcLo = loOperand(Src);
6723     Operand *SrcHi = hiOperand(Src);
6724     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6725     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6726     Variable *T_Lo = nullptr, *T_Hi = nullptr;
6727     _mov(T_Lo, SrcLo);
6728     _redefined(_mov(DestLo, T_Lo), IsRedefinition);
6729     _mov(T_Hi, SrcHi);
6730     _redefined(_mov(DestHi, T_Hi), IsRedefinition);
6731   } else {
6732     Operand *SrcLegal;
6733     if (Dest->hasReg()) {
6734       // If Dest already has a physical register, then only basic legalization
6735       // is needed, as the source operand can be a register, immediate, or
6736       // memory.
6737       SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
6738     } else {
      // If Dest could be a stack operand, then the source must be a physical
      // register or a scalar integer immediate.
6741       SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
6742     }
6743     if (isVectorType(Dest->getType())) {
6744       _redefined(_movp(Dest, SrcLegal), IsRedefinition);
6745     } else {
6746       _redefined(_mov(Dest, SrcLegal), IsRedefinition);
6747     }
6748   }
6749 }
6750 
6751 template <typename TraitsType>
6752 bool TargetX86Base<TraitsType>::lowerOptimizeFcmpSelect(
6753     const InstFcmp *Fcmp, const InstSelect *Select) {
6754   Operand *CmpSrc0 = Fcmp->getSrc(0);
6755   Operand *CmpSrc1 = Fcmp->getSrc(1);
6756   Operand *SelectSrcT = Select->getTrueOperand();
6757   Operand *SelectSrcF = Select->getFalseOperand();
6758   Variable *SelectDest = Select->getDest();
6759 
6760   // TODO(capn): also handle swapped compare/select operand order.
6761   if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
6762     return false;
6763 
6764   // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
6765   InstFcmp::FCond Condition = Fcmp->getCondition();
6766   switch (Condition) {
6767   default:
6768     return false;
6769   case InstFcmp::True:
6770     break;
6771   case InstFcmp::False:
6772     break;
6773   case InstFcmp::Ogt: {
6774     Variable *T = makeReg(SelectDest->getType());
6775     if (isScalarFloatingType(SelectSrcT->getType())) {
6776       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6777       _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6778       _mov(SelectDest, T);
6779     } else {
6780       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6781       _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6782       _movp(SelectDest, T);
6783     }
6784     return true;
6785   } break;
6786   case InstFcmp::Olt: {
6787     Variable *T = makeReg(SelectSrcT->getType());
6788     if (isScalarFloatingType(SelectSrcT->getType())) {
6789       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6790       _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6791       _mov(SelectDest, T);
6792     } else {
6793       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6794       _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6795       _movp(SelectDest, T);
6796     }
6797     return true;
6798   } break;
6799   }
6800   return false;
6801 }
6802 
6803 template <typename TraitsType>
6804 void TargetX86Base<TraitsType>::lowerIcmp(const InstIcmp *Icmp) {
6805   Variable *Dest = Icmp->getDest();
6806   if (isVectorType(Dest->getType())) {
6807     lowerIcmpVector(Icmp);
6808   } else {
6809     constexpr Inst *Consumer = nullptr;
6810     lowerIcmpAndConsumer(Icmp, Consumer);
6811   }
6812 }
6813 
6814 template <typename TraitsType>
6815 void TargetX86Base<TraitsType>::lowerSelectVector(const InstSelect *Instr) {
6816   Variable *Dest = Instr->getDest();
6817   Type DestTy = Dest->getType();
6818   Operand *SrcT = Instr->getTrueOperand();
6819   Operand *SrcF = Instr->getFalseOperand();
6820   Operand *Condition = Instr->getCondition();
6821 
6822   if (!isVectorType(DestTy))
6823     llvm::report_fatal_error("Expected a vector select");
6824 
6825   Type SrcTy = SrcT->getType();
6826   Variable *T = makeReg(SrcTy);
6827   Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
6828   Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
6829 
6830   if (InstructionSet >= Traits::SSE4_1) {
6831     // TODO(wala): If the condition operand is a constant, use blendps or
6832     // pblendw.
6833     //
6834     // Use blendvps or pblendvb to implement select.
6835     if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
6836         SrcTy == IceType_v4f32) {
6837       Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6838       Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0);
6839       _movp(xmm0, ConditionRM);
6840       _psll(xmm0, Ctx->getConstantInt8(31));
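      // blendvps selects by the sign bit of each lane in xmm0, so shifting
      // the 0/1 condition left by 31 moves it into the bit the instruction
      // tests.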
6841       _movp(T, SrcFRM);
6842       _blendvps(T, SrcTRM, xmm0);
6843       _movp(Dest, T);
6844     } else {
6845       assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
6846       Type SignExtTy =
6847           Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
6848       Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0);
6849       lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
6850       _movp(T, SrcFRM);
6851       _pblendvb(T, SrcTRM, xmm0);
6852       _movp(Dest, T);
6853     }
6854     return;
6855   }
6856   // Lower select without Traits::SSE4.1:
6857   // a=d?b:c ==>
6858   //   if elementtype(d) != i1:
6859   //      d=sext(d);
6860   //   a=(b&d)|(c&~d);
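  //
  // With T holding the (sign-extended) condition d, the sequence below
  // computes SrcT & d in T and SrcF & ~d in T2, then ors them together.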
6861   Variable *T2 = makeReg(SrcTy);
6862   // Sign extend the condition operand if applicable.
6863   if (SrcTy == IceType_v4f32) {
6864     // The sext operation takes only integer arguments.
6865     Variable *T3 = Func->makeVariable(IceType_v4i32);
6866     lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
6867     _movp(T, T3);
6868   } else if (typeElementType(SrcTy) != IceType_i1) {
6869     lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
6870   } else {
6871     Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6872     _movp(T, ConditionRM);
6873   }
6874   _movp(T2, T);
6875   _pand(T, SrcTRM);
6876   _pandn(T2, SrcFRM);
6877   _por(T, T2);
6878   _movp(Dest, T);
6879 
6880   return;
6881 }
6882 
6883 template <typename TraitsType>
6884 void TargetX86Base<TraitsType>::lowerStore(const InstStore *Instr) {
6885   Operand *Value = Instr->getData();
6886   Operand *Addr = Instr->getAddr();
6887   X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
6888   doMockBoundsCheck(NewAddr);
6889   Type Ty = NewAddr->getType();
6890 
6891   if (!Traits::Is64Bit && Ty == IceType_i64) {
6892     Value = legalizeUndef(Value);
6893     Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
6894     _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
6895     Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
6896     _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
6897   } else if (isVectorType(Ty)) {
6898     _storep(legalizeToReg(Value), NewAddr);
6899   } else {
6900     Value = legalize(Value, Legal_Reg | Legal_Imm);
6901     _store(Value, NewAddr);
6902   }
6903 }

template <typename TraitsType>
void TargetX86Base<TraitsType>::doAddressOptStore() {
  auto *Instr = llvm::cast<InstStore>(Context.getCur());
  Operand *Addr = Instr->getAddr();
  Operand *Data = Instr->getData();
  if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
    Instr->setDeleted();
    auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
    if (Instr->getDest())
      NewStore->setRmwBeacon(Instr->getRmwBeacon());
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::doAddressOptStoreSubVector() {
  auto *Intrinsic = llvm::cast<InstIntrinsicCall>(Context.getCur());
  Operand *Addr = Intrinsic->getArg(1);
  Operand *Data = Intrinsic->getArg(0);
  if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
    Intrinsic->setDeleted();
    const Ice::Intrinsics::IntrinsicInfo Info = {
        Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
    auto Target = Ctx->getConstantUndef(Ice::IceType_i32);
    auto *NewStore =
        Context.insert<InstIntrinsicCall>(3, nullptr, Target, Info);
    NewStore->addArg(Data);
    NewStore->addArg(OptAddr);
    NewStore->addArg(Intrinsic->getArg(2));
  }
}

template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::lowerCmpRange(Operand *Comparison,
                                                  uint64_t Min, uint64_t Max) {
  // TODO(ascull): 64-bit should not reach here but only because it is not
  // implemented yet. This should be able to handle the 64-bit case.
  assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
  // Subtracting 0 is a nop so don't do it.
  if (Min != 0) {
    // Avoid clobbering the comparison by copying it.
    Variable *T = nullptr;
    _mov(T, Comparison);
    _sub(T, Ctx->getConstantInt32(Min));
    Comparison = T;
  }

  _cmp(Comparison, Ctx->getConstantInt32(Max - Min));

  return Comparison;
}
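
// Worked example of the range check above: testing x in [Min, Max] needs only
// one unsigned comparison, because (x - Min) wraps to a large unsigned value
// whenever x < Min. For Min=5, Max=9, a hedged sketch of the output:
//   subl $5, %x           ; x' = x - 5
//   cmpl $4, %x'          ; compare against Max - Min
//   ; x' <=u 4  <=>  5 <= x <= 9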

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerCaseCluster(const CaseCluster &Case,
                                                 Operand *Comparison,
                                                 bool DoneCmp,
                                                 CfgNode *DefaultTarget) {
  switch (Case.getKind()) {
  case CaseCluster::JumpTable: {
    InstX86Label *SkipJumpTable;

    Operand *RangeIndex =
        lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
    if (DefaultTarget == nullptr) {
      // Skip over the jump table logic if the comparison is not in range and
      // there is no default.
      SkipJumpTable = InstX86Label::create(Func, this);
      _br(Traits::Cond::Br_a, SkipJumpTable);
    } else {
      _br(Traits::Cond::Br_a, DefaultTarget);
    }

    InstJumpTable *JumpTable = Case.getJumpTable();
    Context.insert(JumpTable);

    // Make sure the index is a register of the same width as the base.
    Variable *Index;
    const Type PointerType = getPointerType();
    if (RangeIndex->getType() != PointerType) {
      Index = makeReg(PointerType);
      if (RangeIndex->getType() == IceType_i64) {
        assert(Traits::Is64Bit);
        _mov(Index, RangeIndex); // trunc
      } else {
        Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
        _movzx(Index, RangeIndexRM);
      }
    } else {
      Index = legalizeToReg(RangeIndex);
    }

    constexpr RelocOffsetT RelocOffset = 0;
    constexpr Variable *NoBase = nullptr;
    constexpr Constant *NoOffset = nullptr;
    auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
    Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
    uint16_t Shift = typeWidthInBytesLog2(PointerType);
    constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;

    Variable *Target = nullptr;
    if (Traits::Is64Bit && NeedSandboxing) {
      assert(Index != nullptr && Index->getType() == IceType_i32);
    }

    if (PointerType == IceType_i32) {
      _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
                                         Index, Shift, Segment));
    } else {
      auto *Base = makeReg(IceType_i64);
      _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
      _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
                                         Index, Shift, Segment));
    }

    lowerIndirectJump(Target);

    if (DefaultTarget == nullptr)
      Context.insert(SkipJumpTable);
    return;
  }
  case CaseCluster::Range: {
    if (Case.isUnitRange()) {
      // Single item.
      if (!DoneCmp) {
        Constant *Value = Ctx->getConstantInt32(Case.getLow());
        _cmp(Comparison, Value);
      }
      _br(Traits::Cond::Br_e, Case.getTarget());
    } else if (DoneCmp && Case.isPairRange()) {
      // Range of two items with the first item already compared against.
      _br(Traits::Cond::Br_e, Case.getTarget());
      Constant *Value = Ctx->getConstantInt32(Case.getHigh());
      _cmp(Comparison, Value);
      _br(Traits::Cond::Br_e, Case.getTarget());
    } else {
      // Range.
      lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
      _br(Traits::Cond::Br_be, Case.getTarget());
    }
    if (DefaultTarget != nullptr)
      _br(DefaultTarget);
    return;
  }
  }
}
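
// For orientation, a hedged sketch of the 32-bit jump table dispatch emitted
// above, after lowerCmpRange() has left the zero-based index in a register
// (register names and labels illustrative):
//   ja    default_or_skip            ; index out of range
//   movzx index, %eax                ; widen to pointer width if needed
//   movl  JumpTable(,%eax,4), %eax   ; load the target address
//   jmp   *%eax                      ; lowerIndirectJump()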

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerSwitch(const InstSwitch *Instr) {
  // Group cases together and navigate through them with a binary search.
  CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
  Operand *Src0 = Instr->getComparison();
  CfgNode *DefaultTarget = Instr->getLabelDefault();

  assert(CaseClusters.size() != 0); // Should always be at least one.

  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
    Src0 = legalize(Src0); // get Base/Index into physical registers
    Operand *Src0Lo = loOperand(Src0);
    Operand *Src0Hi = hiOperand(Src0);
    if (CaseClusters.back().getHigh() > UINT32_MAX) {
      // TODO(ascull): handle the 64-bit case properly (currently a naive
      // version). This might be handled by a higher level lowering of
      // switches.
      SizeT NumCases = Instr->getNumCases();
      if (NumCases >= 2) {
        Src0Lo = legalizeToReg(Src0Lo);
        Src0Hi = legalizeToReg(Src0Hi);
      } else {
        Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
        Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
      }
      for (SizeT I = 0; I < NumCases; ++I) {
        Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
        Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
        InstX86Label *Label = InstX86Label::create(Func, this);
        _cmp(Src0Lo, ValueLo);
        _br(Traits::Cond::Br_ne, Label);
        _cmp(Src0Hi, ValueHi);
        _br(Traits::Cond::Br_e, Instr->getLabel(I));
        Context.insert(Label);
      }
      _br(Instr->getLabelDefault());
      return;
    } else {
      // All the values are 32-bit so just check the operand is too and then
      // fall through to the 32-bit implementation. This is a common case.
      Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
      Constant *Zero = Ctx->getConstantInt32(0);
      _cmp(Src0Hi, Zero);
      _br(Traits::Cond::Br_ne, DefaultTarget);
      Src0 = Src0Lo;
    }
  }

  // 32-bit lowering

  if (CaseClusters.size() == 1) {
    // Jump straight to default if needed. Currently a common case as jump
    // tables occur on their own.
    constexpr bool DoneCmp = false;
    lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
    return;
  }

  // Going to be using it multiple times so get it in a register early.
  Variable *Comparison = legalizeToReg(Src0);

  // A span over the clusters.
  struct SearchSpan {
    SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
        : Begin(Begin), Size(Size), Label(Label) {}

    SizeT Begin;
    SizeT Size;
    InstX86Label *Label;
  };
  // The stack will only grow to the height of the tree so 12 should be plenty.
  std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
  SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
  bool DoneCmp = false;

  while (!SearchSpanStack.empty()) {
    SearchSpan Span = SearchSpanStack.top();
    SearchSpanStack.pop();

    if (Span.Label != nullptr)
      Context.insert(Span.Label);

    switch (Span.Size) {
    case 0:
      llvm::report_fatal_error("Invalid SearchSpan size");
      break;

    case 1:
      lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
      DoneCmp = false;
      break;

    case 2: {
      const CaseCluster *CaseA = &CaseClusters[Span.Begin];
      const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];

      // Placing a range last may allow register clobbering during the range
      // test. That means there is no need to clone the register. If it is a
      // unit range the comparison may have already been done in the binary
      // search (DoneCmp) and so it should be placed first. If this is a range
      // of two items and the comparison with the low value has already been
      // done, comparing with the other element is cheaper than a range test.
      // If the low end of the range is zero then there is no subtraction and
      // nothing to be gained.
      if (!CaseA->isUnitRange() &&
          !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
        std::swap(CaseA, CaseB);
        DoneCmp = false;
      }

      lowerCaseCluster(*CaseA, Comparison, DoneCmp);
      DoneCmp = false;
      lowerCaseCluster(*CaseB, Comparison, DoneCmp,
                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
    } break;

    default:
      // Pick the middle item and branch b or ae.
      SizeT PivotIndex = Span.Begin + (Span.Size / 2);
      const CaseCluster &Pivot = CaseClusters[PivotIndex];
      Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
      InstX86Label *Label = InstX86Label::create(Func, this);
      _cmp(Comparison, Value);
      // TODO(ascull): does it always have to be far?
      _br(Traits::Cond::Br_b, Label, InstX86Br::Far);
      // Lower the left and (pivot+right) sides, falling through to the right.
      SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
      SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
      DoneCmp = true;
      break;
    }
  }

  _br(DefaultTarget);
}
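
// A hedged trace of the binary search above for five clusters C0..C4 (spans
// are half-open index ranges into CaseClusters; labels illustrative):
//   pop [0,5): cmp low(C2); jb L1; push [0,2)@L1, then [2,5)
//   pop [2,5): cmp low(C3); jb L2; push [2,3)@L2, then [3,5)
//   pop [3,5): emit the tests for C3, then C4
//   pop [2,3)@L2: emit the test for C2
//   pop [0,2)@L1: emit the tests for C0, then C1 (last pop, so DefaultTarget
//                 is threaded through)
//   finally: jmp DefaultTarget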

/// The following pattern occurs often in lowered C and C++ code:
///
///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
///
/// We can eliminate the sext operation by copying the result of pcmpeqd,
/// pcmpgtd, or cmpps (which produce sign extended results) to the result of
/// the sext operation.
template <typename TraitsType>
void TargetX86Base<TraitsType>::eliminateNextVectorSextInstruction(
    Variable *SignExtendedResult) {
  if (auto *NextCast =
          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
    if (NextCast->getCastKind() == InstCast::Sext &&
        NextCast->getSrc(0) == SignExtendedResult) {
      NextCast->setDeleted();
      _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
      // Skip over the instruction.
      Context.advanceNext();
    }
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerUnreachable(
    const InstUnreachable * /*Instr*/) {
  _ud2();
  // Add a fake use of esp to make sure esp adjustments after the unreachable
  // do not get dead-code eliminated.
  keepEspLiveAtExit();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerBreakpoint(
    const InstBreakpoint * /*Instr*/) {
  _int3();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerRMW(const InstX86FakeRMW *RMW) {
  // If the beacon variable's live range does not end in this instruction, then
  // it must end in the modified Store instruction that follows. This means
  // that the original Store instruction is still there, either because the
  // value being stored is used beyond the Store instruction, or because dead
  // code elimination did not happen. In either case, we cancel RMW lowering
  // (and the caller deletes the RMW instruction).
  if (!RMW->isLastUse(RMW->getBeacon()))
    return;
  Operand *Src = RMW->getData();
  Type Ty = Src->getType();
  X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
  doMockBoundsCheck(Addr);
  if (!Traits::Is64Bit && Ty == IceType_i64) {
    Src = legalizeUndef(Src);
    Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
    Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
    auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
    auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
    switch (RMW->getOp()) {
    default:
      // TODO(stichnot): Implement other arithmetic operators.
      break;
    case InstArithmetic::Add:
      _add_rmw(AddrLo, SrcLo);
      _adc_rmw(AddrHi, SrcHi);
      return;
    case InstArithmetic::Sub:
      _sub_rmw(AddrLo, SrcLo);
      _sbb_rmw(AddrHi, SrcHi);
      return;
    case InstArithmetic::And:
      _and_rmw(AddrLo, SrcLo);
      _and_rmw(AddrHi, SrcHi);
      return;
    case InstArithmetic::Or:
      _or_rmw(AddrLo, SrcLo);
      _or_rmw(AddrHi, SrcHi);
      return;
    case InstArithmetic::Xor:
      _xor_rmw(AddrLo, SrcLo);
      _xor_rmw(AddrHi, SrcHi);
      return;
    }
  } else {
    // x86-32: i8, i16, i32
    // x86-64: i8, i16, i32, i64
    switch (RMW->getOp()) {
    default:
      // TODO(stichnot): Implement other arithmetic operators.
      break;
    case InstArithmetic::Add:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _add_rmw(Addr, Src);
      return;
    case InstArithmetic::Sub:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _sub_rmw(Addr, Src);
      return;
    case InstArithmetic::And:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _and_rmw(Addr, Src);
      return;
    case InstArithmetic::Or:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _or_rmw(Addr, Src);
      return;
    case InstArithmetic::Xor:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _xor_rmw(Addr, Src);
      return;
    }
  }
  llvm::report_fatal_error("Couldn't lower RMW instruction");
}
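
// For context, a hedged sketch of what RMW lowering buys: the pattern
//   %t = load i32, addr;  %t2 = add i32 %t, %v;  store i32 %t2, addr
// collapses into a single read-modify-write instruction instead of a
// load/add/store triple (AT&T syntax, names illustrative):
//   addl %v, (addr)
// The 64-bit variant on x86-32 pairs addl/adcl over the lo/hi halves.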

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerOther(const Inst *Instr) {
  if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
    lowerRMW(RMW);
  } else {
    TargetLowering::lowerOther(Instr);
  }
}

/// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
/// preserve integrity of liveness analysis. Undef values are also turned into
/// zeroes, since loOperand() and hiOperand() don't expect Undef input.  Also,
/// in Non-SFI mode, add a FakeUse(RebasePtr) for every pooled constant
/// operand.
template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() {
  if (getFlags().getUseNonsfi()) {
    assert(RebasePtr);
    CfgNode *Node = Context.getNode();
    uint32_t RebasePtrUseCount = 0;
    for (Inst &I : Node->getPhis()) {
      auto *Phi = llvm::dyn_cast<InstPhi>(&I);
      if (Phi->isDeleted())
        continue;
      for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
        Operand *Src = Phi->getSrc(I);
        // TODO(stichnot): This over-counts for +0.0, and under-counts for
        // other kinds of pooling.
        if (llvm::isa<ConstantRelocatable>(Src) ||
            llvm::isa<ConstantFloat>(Src) || llvm::isa<ConstantDouble>(Src)) {
          ++RebasePtrUseCount;
        }
      }
    }
    if (RebasePtrUseCount) {
      Node->getInsts().push_front(InstFakeUse::create(Func, RebasePtr));
    }
  }
  if (Traits::Is64Bit) {
    // On x86-64 we don't need to prelower phis -- the architecture can handle
    // 64-bit integers natively.
    return;
  }

  // Pause constant blinding or pooling; it will be done later, during phi
  // lowering assignments.
  BoolFlagSaver B(RandomizationPoolingPaused, true);
  PhiLowering::prelowerPhis32Bit<TargetX86Base<TraitsType>>(
      this, Context.getNode(), Func);
}
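
// Illustrative sketch of the i64 phi split performed on x86-32 (the .lo/.hi
// names are hypothetical; the real work happens in
// PhiLowering::prelowerPhis32Bit):
//   %x = phi i64 [ %a, %pred1 ], [ %b, %pred2 ]
// becomes
//   %x.lo = phi i32 [ %a.lo, %pred1 ], [ %b.lo, %pred2 ]
//   %x.hi = phi i32 [ %a.hi, %pred1 ], [ %b.hi, %pred2 ]
// so liveness analysis only ever sees register-sized values.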

template <typename TraitsType>
void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
  uint32_t StackArgumentsSize = 0;
  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
    RuntimeHelper HelperID = RuntimeHelper::H_Num;
    Variable *Dest = Arith->getDest();
    Type DestTy = Dest->getType();
    if (!Traits::Is64Bit && DestTy == IceType_i64) {
      switch (Arith->getOp()) {
      default:
        return;
      case InstArithmetic::Udiv:
        HelperID = RuntimeHelper::H_udiv_i64;
        break;
      case InstArithmetic::Sdiv:
        HelperID = RuntimeHelper::H_sdiv_i64;
        break;
      case InstArithmetic::Urem:
        HelperID = RuntimeHelper::H_urem_i64;
        break;
      case InstArithmetic::Srem:
        HelperID = RuntimeHelper::H_srem_i64;
        break;
      }
    } else if (isVectorType(DestTy)) {
      Variable *Dest = Arith->getDest();
      Operand *Src0 = Arith->getSrc(0);
      Operand *Src1 = Arith->getSrc(1);
      switch (Arith->getOp()) {
      default:
        return;
      case InstArithmetic::Mul:
        if (DestTy == IceType_v16i8) {
          scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
          Arith->setDeleted();
        }
        return;
      case InstArithmetic::Shl:
      case InstArithmetic::Lshr:
      case InstArithmetic::Ashr:
        if (llvm::isa<Constant>(Src1)) {
          return;
        }
        // Fall through to scalarization for non-constant shift amounts.
      case InstArithmetic::Udiv:
      case InstArithmetic::Urem:
      case InstArithmetic::Sdiv:
      case InstArithmetic::Srem:
      case InstArithmetic::Frem:
        scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
        Arith->setDeleted();
        return;
      }
    } else {
      switch (Arith->getOp()) {
      default:
        return;
      case InstArithmetic::Frem:
        if (isFloat32Asserting32Or64(DestTy))
          HelperID = RuntimeHelper::H_frem_f32;
        else
          HelperID = RuntimeHelper::H_frem_f64;
      }
    }
    constexpr SizeT MaxSrcs = 2;
    InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
    Call->addArg(Arith->getSrc(0));
    Call->addArg(Arith->getSrc(1));
    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
    Context.insert(Call);
    Arith->setDeleted();
  } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    InstCast::OpKind CastKind = Cast->getCastKind();
    Operand *Src0 = Cast->getSrc(0);
    const Type SrcType = Src0->getType();
    Variable *Dest = Cast->getDest();
    const Type DestTy = Dest->getType();
    RuntimeHelper HelperID = RuntimeHelper::H_Num;
    Variable *CallDest = Dest;
    switch (CastKind) {
    default:
      return;
    case InstCast::Fptosi:
      if (!Traits::Is64Bit && DestTy == IceType_i64) {
        HelperID = isFloat32Asserting32Or64(SrcType)
                       ? RuntimeHelper::H_fptosi_f32_i64
                       : RuntimeHelper::H_fptosi_f64_i64;
      } else {
        return;
      }
      break;
    case InstCast::Fptoui:
      if (isVectorType(DestTy)) {
        assert(DestTy == IceType_v4i32);
        assert(SrcType == IceType_v4f32);
        HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
      } else if (DestTy == IceType_i64 ||
                 (!Traits::Is64Bit && DestTy == IceType_i32)) {
        if (Traits::Is64Bit) {
          HelperID = isFloat32Asserting32Or64(SrcType)
                         ? RuntimeHelper::H_fptoui_f32_i64
                         : RuntimeHelper::H_fptoui_f64_i64;
        } else if (isInt32Asserting32Or64(DestTy)) {
          HelperID = isFloat32Asserting32Or64(SrcType)
                         ? RuntimeHelper::H_fptoui_f32_i32
                         : RuntimeHelper::H_fptoui_f64_i32;
        } else {
          HelperID = isFloat32Asserting32Or64(SrcType)
                         ? RuntimeHelper::H_fptoui_f32_i64
                         : RuntimeHelper::H_fptoui_f64_i64;
        }
      } else {
        return;
      }
      break;
    case InstCast::Sitofp:
      if (!Traits::Is64Bit && SrcType == IceType_i64) {
        HelperID = isFloat32Asserting32Or64(DestTy)
                       ? RuntimeHelper::H_sitofp_i64_f32
                       : RuntimeHelper::H_sitofp_i64_f64;
      } else {
        return;
      }
      break;
    case InstCast::Uitofp:
      if (isVectorType(SrcType)) {
        assert(DestTy == IceType_v4f32);
        assert(SrcType == IceType_v4i32);
        HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
      } else if (SrcType == IceType_i64 ||
                 (!Traits::Is64Bit && SrcType == IceType_i32)) {
        if (isInt32Asserting32Or64(SrcType)) {
          HelperID = isFloat32Asserting32Or64(DestTy)
                         ? RuntimeHelper::H_uitofp_i32_f32
                         : RuntimeHelper::H_uitofp_i32_f64;
        } else {
          HelperID = isFloat32Asserting32Or64(DestTy)
                         ? RuntimeHelper::H_uitofp_i64_f32
                         : RuntimeHelper::H_uitofp_i64_f64;
        }
      } else {
        return;
      }
      break;
    case InstCast::Bitcast: {
      if (DestTy == Src0->getType())
        return;
      switch (DestTy) {
      default:
        return;
      case IceType_i8:
        assert(Src0->getType() == IceType_v8i1);
        HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
        CallDest = Func->makeVariable(IceType_i32);
        break;
      case IceType_i16:
        assert(Src0->getType() == IceType_v16i1);
        HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
        CallDest = Func->makeVariable(IceType_i32);
        break;
      case IceType_v8i1: {
        assert(Src0->getType() == IceType_i8);
        HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
        // Arguments to functions are required to be at least 32 bits wide.
        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
        Src0 = Src0AsI32;
      } break;
      case IceType_v16i1: {
        assert(Src0->getType() == IceType_i16);
        HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
        // Arguments to functions are required to be at least 32 bits wide.
        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
        Src0 = Src0AsI32;
      } break;
      }
    } break;
    }
    constexpr SizeT MaxSrcs = 1;
    InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
    Call->addArg(Src0);
    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
    Context.insert(Call);
    // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
    // result to the appropriate type as necessary.
    if (CallDest->getType() != Dest->getType())
      Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
    Cast->setDeleted();
  } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsicCall>(Instr)) {
    CfgVector<Type> ArgTypes;
    Type ReturnType = IceType_void;
    switch (Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicInfo().ID) {
    default:
      return;
    case Intrinsics::Ctpop: {
      Operand *Val = Intrinsic->getArg(0);
      Type ValTy = Val->getType();
      if (ValTy == IceType_i64)
        ArgTypes = {IceType_i64};
      else
        ArgTypes = {IceType_i32};
      ReturnType = IceType_i32;
    } break;
    case Intrinsics::Longjmp:
      ArgTypes = {IceType_i32, IceType_i32};
      ReturnType = IceType_void;
      break;
    case Intrinsics::Memcpy:
      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
      ReturnType = IceType_void;
      break;
    case Intrinsics::Memmove:
      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
      ReturnType = IceType_void;
      break;
    case Intrinsics::Memset:
      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
      ReturnType = IceType_void;
      break;
    case Intrinsics::NaClReadTP:
      ReturnType = IceType_i32;
      break;
    case Intrinsics::Setjmp:
      ArgTypes = {IceType_i32};
      ReturnType = IceType_i32;
      break;
    }
    StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
  } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
  } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
    if (!Ret->hasRetValue())
      return;
    Operand *RetValue = Ret->getRetValue();
    Type ReturnType = RetValue->getType();
    if (!isScalarFloatingType(ReturnType))
      return;
    StackArgumentsSize = typeWidthInBytes(ReturnType);
  } else {
    return;
  }
  StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize);
  updateMaxOutArgsSizeBytes(StackArgumentsSize);
}
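
// A hedged example of the rewrite performed above: on x86-32, an i64 signed
// division
//   %q = sdiv i64 %a, %b
// is deleted and replaced with a call to the RuntimeHelper::H_sdiv_i64
// helper, conceptually
//   %q = call i64 @helper_sdiv_i64(i64 %a, i64 %b)
// (the helper's actual symbol name is resolved by the runtime-helper
// machinery, not fixed here).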

template <typename TraitsType>
uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
    const CfgVector<Type> &ArgTypes, Type ReturnType) {
  uint32_t OutArgumentsSizeBytes = 0;
  uint32_t XmmArgCount = 0;
  uint32_t GprArgCount = 0;
  for (Type Ty : ArgTypes) {
    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
    assert(typeWidthInBytes(Ty) >= 4);
    if (isVectorType(Ty) && XmmArgCount < Traits::X86_MAX_XMM_ARGS) {
      ++XmmArgCount;
    } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
               XmmArgCount < Traits::X86_MAX_XMM_ARGS) {
      ++XmmArgCount;
    } else if (isScalarIntegerType(Ty) &&
               GprArgCount < Traits::X86_MAX_GPR_ARGS) {
      // The 64 bit ABI allows some integers to be passed in GPRs.
      ++GprArgCount;
    } else {
      if (isVectorType(Ty)) {
        OutArgumentsSizeBytes =
            Traits::applyStackAlignment(OutArgumentsSizeBytes);
      }
      OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
    }
  }
  if (Traits::Is64Bit)
    return OutArgumentsSizeBytes;
  // The 32 bit ABI requires floating point values to be returned on the x87
  // FP stack. Ensure there is enough space for the fstp/movs for floating
  // returns.
  if (isScalarFloatingType(ReturnType)) {
    OutArgumentsSizeBytes =
        std::max(OutArgumentsSizeBytes,
                 static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
  }
  return OutArgumentsSizeBytes;
}
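
// A hedged worked instance of the accumulation above, assuming (i32, f64,
// v4i32) arguments that all fail the register checks and land on the stack:
//   i32   -> +4 bytes                      (total  4)
//   f64   -> +8 bytes                      (total 12)
//   v4i32 -> align 12 up to 16, +16 bytes  (total 32)
// In practice the leading vector/FP/integer arguments are usually absorbed
// by the XmmArgCount/GprArgCount checks and contribute no stack bytes.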

template <typename TraitsType>
uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
    const InstCall *Instr) {
  // Build a vector of the arguments' types.
  const SizeT NumArgs = Instr->getNumArgs();
  CfgVector<Type> ArgTypes;
  ArgTypes.reserve(NumArgs);
  for (SizeT i = 0; i < NumArgs; ++i) {
    Operand *Arg = Instr->getArg(i);
    ArgTypes.emplace_back(Arg->getType());
  }
  // Compute the return type (if any).
  Type ReturnType = IceType_void;
  Variable *Dest = Instr->getDest();
  if (Dest != nullptr)
    ReturnType = Dest->getType();
  return getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeZeroedRegister(Type Ty,
                                                        RegNumT RegNum) {
  Variable *Reg = makeReg(Ty, RegNum);
  switch (Ty) {
  case IceType_i1:
  case IceType_i8:
  case IceType_i16:
  case IceType_i32:
  case IceType_i64:
    // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
    _mov(Reg, Ctx->getConstantZero(Ty));
    break;
  case IceType_f32:
  case IceType_f64:
    Context.insert<InstFakeDef>(Reg);
    _xorps(Reg, Reg);
    break;
  default:
    // All vector types use the same pxor instruction.
    assert(isVectorType(Ty));
    Context.insert<InstFakeDef>(Reg);
    _pxor(Reg, Reg);
    break;
  }
  return Reg;
}

// There is no support for loading or emitting vector constants, so the vector
// values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
// initialized with register operations.
//
// TODO(wala): Add limited support for vector constants so that complex
// initialization in registers is unnecessary.

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfZeros(Type Ty,
                                                       RegNumT RegNum) {
  return makeZeroedRegister(Ty, RegNum);
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty,
                                                           RegNumT RegNum) {
  Variable *MinusOnes = makeReg(Ty, RegNum);
  // Insert a FakeDef so the live range of MinusOnes is not overestimated.
  Context.insert<InstFakeDef>(MinusOnes);
  if (Ty == IceType_f64)
    // Making a vector of minus ones of type f64 is currently only used for the
    // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
    // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
    // same job and only requires SSE2.
    _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
  else
    _pcmpeq(MinusOnes, MinusOnes);
  return MinusOnes;
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
  Variable *Dest = makeVectorOfZeros(Ty, RegNum);
  Variable *MinusOne = makeVectorOfMinusOnes(Ty);
  _psub(Dest, MinusOne);
  return Dest;
}
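
// The arithmetic behind makeVectorOfOnes, as a hedged sketch: pcmpeq of a
// register with itself sets every lane to all-ones (-1), and subtracting -1
// from 0 gives +1 in each lane:
//   pxor    %xmm0, %xmm0    ; <0, 0, 0, 0>
//   pcmpeqd %xmm1, %xmm1    ; <-1, -1, -1, -1>
//   psubd   %xmm1, %xmm0    ; 0 - (-1) = <1, 1, 1, 1>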

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfHighOrderBits(Type Ty,
                                                               RegNumT RegNum) {
  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
         Ty == IceType_v16i8);
  if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
    Variable *Reg = makeVectorOfOnes(Ty, RegNum);
    SizeT Shift =
        typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
    _psll(Reg, Ctx->getConstantInt8(Shift));
    return Reg;
  } else {
    // SSE has no left shift operation for vectors of 8 bit integers.
    constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
    Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
    Variable *Reg = makeReg(Ty, RegNum);
    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
    return Reg;
  }
}
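
// Worked example for v4i32: the shift amount is 4 bytes * 8 bits - 1 = 31, so
// the <1, 1, 1, 1> produced by makeVectorOfOnes becomes
// <0x80000000, 0x80000000, 0x80000000, 0x80000000> after pslld $31 -- only
// the sign bit is set in each lane.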

/// Construct a mask in a register that can be and'ed with a floating-point
/// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
/// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as a vector
/// of ones logically right shifted one bit.
// TODO(stichnot): Fix the wala TODO above, to represent vector constants in
// memory.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfFabsMask(Type Ty,
                                                          RegNumT RegNum) {
  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
  _psrl(Reg, Ctx->getConstantInt8(1));
  return Reg;
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
                                                        uint32_t Offset) {
  // Ensure that Slot is a stack slot.
  assert(Slot->mustNotHaveReg());
  assert(Slot->getRegNum().hasNoValue());
  // Compute the location of Slot in memory.
  // TODO(wala,stichnot): lea should not be required. The address of the stack
  // slot is known at compile time (although not until after addProlog()).
  const Type PointerType = getPointerType();
  Variable *Loc = makeReg(PointerType);
  _lea(Loc, Slot);
  Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
  return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
}

/// Lowering helper to copy a scalar integer source operand into some 8-bit
/// GPR. Src is assumed to already be legalized.  If the source operand is
/// known to be a memory or immediate operand, a simple mov will suffice.  But
/// if the source operand can be a physical register, then it must first be
/// copied into a physical register that is truncable to 8-bit, then truncated
/// into a physical register that can receive a truncation, and finally copied
/// into the result 8-bit register (which in general can be any 8-bit
/// register).  For example, moving %ebp into %ah may be accomplished as:
///   movl %ebp, %edx
///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
///   movb %dl, %ah
/// On the other hand, moving a memory or immediate operand into ah:
///   movb 4(%ebp), %ah
///   movb $my_imm, %ah
///
/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
/// use RegNum=RegNumT() and then let the caller do a separate copy into
/// Reg_ah.
///
/// Note #2.  ConstantRelocatable operands are also put through this process
/// (not truncated directly) because our ELF emitter does R_386_32 relocations
/// but not R_386_8 relocations.
///
/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
/// to the pinsrb instruction.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::copyToReg8(Operand *Src, RegNumT RegNum) {
  Type Ty = Src->getType();
  assert(isScalarIntegerType(Ty));
  assert(Ty != IceType_i1);
  Variable *Reg = makeReg(IceType_i8, RegNum);
  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
    Variable *SrcTruncable = makeReg(Ty);
    switch (Ty) {
    case IceType_i64:
      SrcTruncable->setRegClass(RCX86_Is64To8);
      break;
    case IceType_i32:
      SrcTruncable->setRegClass(RCX86_Is32To8);
      break;
    case IceType_i16:
      SrcTruncable->setRegClass(RCX86_Is16To8);
      break;
    default:
      // i8 - just use the default register class.
      break;
    }
    Variable *SrcRcvr = makeReg(IceType_i8);
    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
    _mov(SrcTruncable, Src);
    _mov(SrcRcvr, SrcTruncable);
    Src = SrcRcvr;
  }
  _mov(Reg, Src);
  return Reg;
}

/// Helper for legalize() to emit the right code to lower an operand to a
/// register of the appropriate type.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::copyToReg(Operand *Src, RegNumT RegNum) {
  Type Ty = Src->getType();
  Variable *Reg = makeReg(Ty, RegNum);
  if (isVectorType(Ty)) {
    _movp(Reg, Src);
  } else {
    _mov(Reg, Src);
  }
  return Reg;
}

template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed,
                                             RegNumT RegNum) {
  const bool UseNonsfi = getFlags().getUseNonsfi();
  const Type Ty = From->getType();
  // Assert that a physical register is allowed. To date, all calls to
  // legalize() allow a physical register. If a physical register needs to be
  // explicitly disallowed, then new code will need to be written to force a
  // spill.
  assert(Allowed & Legal_Reg);
  // If we're asking for a specific physical register, make sure we're not
  // allowing any other operand kinds. (This could be future work, e.g. allow
  // the shl shift amount to be either an immediate or in ecx.)
  assert(RegNum.hasNoValue() || Allowed == Legal_Reg);

  // Substitute with an available infinite-weight variable if possible.  Only
  // do this when we are not asking for a specific register, and when the
  // substitution is not locked to a specific register, and when the types
  // match, in order to capture the vast majority of opportunities and avoid
  // corner cases in the lowering.
  if (RegNum.hasNoValue()) {
    if (Variable *Subst = getContext().availabilityGet(From)) {
      // At this point we know there is a potential substitution available.
      if (Subst->mustHaveReg() && !Subst->hasReg()) {
        // At this point we know the substitution will have a register.
        if (From->getType() == Subst->getType()) {
          // At this point we know the substitution's register is compatible.
          return Subst;
        }
      }
    }
  }

  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
    // Before doing anything with a Mem operand, we need to ensure that the
    // Base and Index components are in physical registers.
    Variable *Base = Mem->getBase();
    Variable *Index = Mem->getIndex();
    Constant *Offset = Mem->getOffset();
    Variable *RegBase = nullptr;
    Variable *RegIndex = nullptr;
    uint16_t Shift = Mem->getShift();
    if (Base) {
      RegBase = llvm::cast<Variable>(
          legalize(Base, Legal_Reg | Legal_Rematerializable));
    }
    if (Index) {
      // TODO(jpp): perhaps we should only allow Legal_Reg if
      // Base->isRematerializable.
      RegIndex = llvm::cast<Variable>(
          legalize(Index, Legal_Reg | Legal_Rematerializable));
    }

    if (Base != RegBase || Index != RegIndex) {
      Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
                                  Mem->getSegmentRegister());
    }

    // For all Memory Operands, we do randomization/pooling here.
    From = randomizeOrPoolImmediate(Mem);

    if (!(Allowed & Legal_Mem)) {
      From = copyToReg(From, RegNum);
    }
    return From;
  }

  if (auto *Const = llvm::dyn_cast<Constant>(From)) {
    if (llvm::isa<ConstantUndef>(Const)) {
      From = legalizeUndef(Const, RegNum);
      if (isVectorType(Ty))
        return From;
      Const = llvm::cast<Constant>(From);
    }
    // There should be no constants of vector type (other than undef).
    assert(!isVectorType(Ty));

    // If the operand is a 64 bit constant integer we need to legalize it to a
    // register in x86-64.
    if (Traits::Is64Bit) {
      if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
        if (!Utils::IsInt(32, C64->getValue())) {
          if (RegNum.hasValue()) {
            assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum);
          }
          return copyToReg(Const, RegNum);
        }
      }
    }

    // If the operand is a 32 bit constant integer, we should check whether we
    // need to randomize it or pool it.
    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Const)) {
      Operand *NewConst = randomizeOrPoolImmediate(C, RegNum);
      if (NewConst != Const) {
        return NewConst;
      }
    }

    if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Const)) {
      // If the operand is a ConstantRelocatable, and Legal_AddrAbs is not
      // specified, and UseNonsfi is indicated, we need to add RebasePtr.
      if (UseNonsfi && !(Allowed & Legal_AddrAbs)) {
        assert(Ty == IceType_i32);
        Variable *NewVar = makeReg(Ty, RegNum);
        auto *Mem = Traits::X86OperandMem::create(Func, Ty, nullptr, CR);
        // LEAs are not automatically sandboxed, thus we explicitly invoke
        // _sandbox_mem_reference.
        _lea(NewVar, _sandbox_mem_reference(Mem));
        From = NewVar;
      }
    } else if (isScalarFloatingType(Ty)) {
      // Convert a scalar floating point constant into an explicit memory
      // operand.
      if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
        if (Utils::isPositiveZero(ConstFloat->getValue()))
          return makeZeroedRegister(Ty, RegNum);
      } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
        if (Utils::isPositiveZero(ConstDouble->getValue()))
          return makeZeroedRegister(Ty, RegNum);
      }

      auto *CFrom = llvm::cast<Constant>(From);
      assert(CFrom->getShouldBePooled());
      Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
      auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
      From = Mem;
    }

    bool NeedsReg = false;
    if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
      // Immediate specifically not allowed.
      NeedsReg = true;
    if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
      // On x86, FP constants are lowered to mem operands.
      NeedsReg = true;
    if (NeedsReg) {
      From = copyToReg(From, RegNum);
    }
    return From;
  }

  if (auto *Var = llvm::dyn_cast<Variable>(From)) {
    // Check if the variable is guaranteed a physical register. This can happen
    // either when the variable is pre-colored or when it is assigned infinite
    // weight.
    bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
    bool MustRematerialize =
        (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
    // We need a new physical register for the operand if:
    // - Mem is not allowed and Var isn't guaranteed a physical register, or
    // - RegNum is required and Var->getRegNum() doesn't match, or
    // - Var is a rematerializable variable and rematerializable pass-through
    //   is not allowed (in which case we need a lea instruction).
    if (MustRematerialize) {
      Variable *NewVar = makeReg(Ty, RegNum);
      // Since Var is rematerializable, the offset will be added when the lea
      // is emitted.
      constexpr Constant *NoOffset = nullptr;
      auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
      _lea(NewVar, Mem);
      From = NewVar;
    } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
               (RegNum.hasValue() && RegNum != Var->getRegNum())) {
      From = copyToReg(From, RegNum);
    }
    return From;
  }

  llvm::report_fatal_error("Unhandled operand kind in legalize()");
  return From;
}
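
// Typical call sites, for orientation: a two-operand instruction that can
// take one memory operand legalizes one side to Legal_Reg | Legal_Mem and
// forces the other all the way into a register, e.g.
//   Operand *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
//   Variable *DestR = legalizeToReg(Dest);
// (names here are illustrative, not taken from a specific lowering above).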

/// Provide a trivial wrapper to legalize() for this common usage.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::legalizeToReg(Operand *From,
                                                   RegNumT RegNum) {
  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
}

/// Legalize undef values to concrete values.
template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalizeUndef(Operand *From,
                                                  RegNumT RegNum) {
  Type Ty = From->getType();
  if (llvm::isa<ConstantUndef>(From)) {
    // Lower undefs to zero.  Another option is to lower undefs to an
    // uninitialized register; however, using an uninitialized register results
    // in less predictable code.
    //
    // If in the future the implementation is changed to lower undef values to
    // uninitialized registers, a FakeDef will be needed:
    //     Context.insert<InstFakeDef>(Reg);
    // This is in order to ensure that the live range of Reg is not
    // overestimated.  If the constant being lowered is a 64 bit value, then
    // the result should be split and the lo and hi components will need to go
    // in uninitialized registers.
    if (isVectorType(Ty))
      return makeVectorOfZeros(Ty, RegNum);
    return Ctx->getConstantZero(Ty);
  }
  return From;
}

/// For the cmp instruction, if Src1 is an immediate, or known to be a physical
/// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
/// copied into a physical register. (Actually, either Src0 or Src1 can be
/// chosen for the physical register, but unfortunately we have to commit to
/// one or the other before register allocation.)
template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalizeSrc0ForCmp(Operand *Src0,
                                                       Operand *Src1) {
  bool IsSrc1ImmOrReg = false;
  if (llvm::isa<Constant>(Src1)) {
    IsSrc1ImmOrReg = true;
  } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
    if (Var->hasReg())
      IsSrc1ImmOrReg = true;
  }
  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::formMemoryOperand(Operand *Opnd, Type Ty,
                                             bool DoLegalize) {
  auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
  // It may be the case that address mode optimization already creates an
  // X86OperandMem, so in that case it wouldn't need another level of
  // transformation.
  if (!Mem) {
    auto *Base = llvm::dyn_cast<Variable>(Opnd);
    auto *Offset = llvm::dyn_cast<Constant>(Opnd);
    assert(Base || Offset);
    if (Offset) {
      // During memory operand building, we do not blind or pool the constant
      // offset; we will work on the whole memory operand as one entity later,
      // which saves one instruction. By turning blinding and pooling off, we
      // guarantee legalize(Offset) will return a Constant*.
      if (!llvm::isa<ConstantRelocatable>(Offset)) {
        BoolFlagSaver B(RandomizationPoolingPaused, true);

        Offset = llvm::cast<Constant>(legalize(Offset));
      }

      assert(llvm::isa<ConstantInteger32>(Offset) ||
             llvm::isa<ConstantRelocatable>(Offset));
    }
    // Not completely sure whether it's OK to leave IsRebased unset when
    // creating the mem operand.  If DoLegalize is true, it will definitely be
    // applied during the legalize() call, but perhaps not during the
    // randomizeOrPoolImmediate() call.  In any case, the emit routines will
    // assert that PIC legalization has been applied.
    Mem = X86OperandMem::create(Func, Ty, Base, Offset);
  }
  // Do legalization (which includes randomization/pooling), or do
  // randomization/pooling directly.
  return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem)
                                              : randomizeOrPoolImmediate(Mem));
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeReg(Type Type, RegNumT RegNum) {
  // There aren't any 64-bit integer registers for x86-32.
  assert(Traits::Is64Bit || Type != IceType_i64);
  Variable *Reg = Func->makeVariable(Type);
  if (RegNum.hasValue())
    Reg->setRegNum(RegNum);
  else
    Reg->setMustHaveReg();
  return Reg;
}

const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
                            IceType_v16i8};

template <typename TraitsType>
Type TargetX86Base<TraitsType>::largestTypeInSize(uint32_t Size,
                                                  uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}

template <typename TraitsType>
Type TargetX86Base<TraitsType>::firstTypeThatFitsSize(uint32_t Size,
                                                      uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  if (!llvm::isPowerOf2_32(Size))
    ++TyIndex;
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}
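
// Worked examples of the two helpers above, with no MaxSize limit:
//   largestTypeInSize(11)     -> findLastSet(11) = 3 -> TypeForSize[3], i.e.
//                                IceType_f64 (the largest type <= 11 bytes).
//   firstTypeThatFitsSize(11) -> 11 is not a power of two, so the index is
//                                bumped to 4 -> IceType_v16i8 (the smallest
//                                type >= 11 bytes).
// These choices feed the chunked memory-intrinsic lowerings elsewhere in
// this target.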

template <typename TraitsType> void TargetX86Base<TraitsType>::postLower() {
  if (Func->getOptLevel() == Opt_m1)
    return;
  markRedefinitions();
  Context.availabilityUpdate();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::makeRandomRegisterPermutation(
    llvm::SmallVectorImpl<RegNumT> &Permutation,
    const SmallBitVector &ExcludeRegisters, uint64_t Salt) const {
  Traits::makeRandomRegisterPermutation(Func, Permutation, ExcludeRegisters,
                                        Salt);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantInteger32 *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << "$" << C->getValue();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantInteger64 *C) const {
  if (!Traits::Is64Bit) {
    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
  } else {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    Str << "$" << C->getValue();
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantFloat *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << C->getLabelName();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantDouble *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << C->getLabelName();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantUndef *) const {
  llvm::report_fatal_error("undef value encountered by emitter.");
}

template <class Machine>
void TargetX86Base<Machine>::emit(const ConstantRelocatable *C) const {
  if (!BuildDefs::dump())
    return;
  assert(!getFlags().getUseNonsfi() ||
         C->getName().toString() == GlobalOffsetTable);
  Ostream &Str = Ctx->getStrEmit();
  Str << "$";
  emitWithoutPrefix(C);
}
8213 
8214 /// Randomize or pool an Immediate.
8215 template <typename TraitsType>
8216 Operand *
8217 TargetX86Base<TraitsType>::randomizeOrPoolImmediate(Constant *Immediate,
8218                                                     RegNumT RegNum) {
8219   assert(llvm::isa<ConstantInteger32>(Immediate) ||
8220          llvm::isa<ConstantRelocatable>(Immediate));
8221   if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
8222       RandomizationPoolingPaused == true) {
8223     // Immediates randomization/pooling off or paused
8224     return Immediate;
8225   }
8226 
8227   if (Traits::Is64Bit && NeedSandboxing) {
8228     // Immediate randomization/pooling is currently disabled for x86-64
8229     // sandboxing for it could generate invalid memory operands.
8230     assert(false &&
8231            "Constant pooling/randomization is disabled for x8664 sandbox.");
8232     return Immediate;
8233   }
8234 
8235   if (!Immediate->shouldBeRandomizedOrPooled()) {
8236     // the constant Immediate is not eligible for blinding/pooling
8237     return Immediate;
8238   }
8239   Ctx->statsUpdateRPImms();
8240   switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
8241   default:
8242     llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
8243   case RPI_Randomize: {
8244     // blind the constant
8245     // FROM:
8246     //  imm
8247     // TO:
8248     //  insert: mov imm+cookie, Reg
8249     //  insert: lea -cookie[Reg], Reg
8250     //  => Reg
8251     // If we have already assigned a phy register, we must come from
8252     // advancedPhiLowering()=>lowerAssign(). In this case we should reuse the
8253     // assigned register as this assignment is that start of its use-def
8254     // chain. So we add RegNum argument here. Note we use 'lea' instruction
8255     // instead of 'xor' to avoid affecting the flags.
8256     Variable *Reg = makeReg(IceType_i32, RegNum);
8257     auto *Integer = llvm::cast<ConstantInteger32>(Immediate);
8258     uint32_t Value = Integer->getValue();
8259     uint32_t Cookie = Func->getConstantBlindingCookie();
8260     _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value));
8261     Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie);
8262     _lea(Reg, X86OperandMem::create(Func, IceType_i32, Reg, Offset));
8263     if (Immediate->getType() == IceType_i32) {
8264       return Reg;
8265     }
8266     Variable *TruncReg = makeReg(Immediate->getType(), RegNum);
8267     _mov(TruncReg, Reg);
8268     return TruncReg;
8269   }
  case RPI_Pool: {
    // Pool the constant.
    // FROM:
    //  imm
    // TO:
    //  insert: mov $label, Reg
    //  => Reg
    assert(getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool);
    assert(Immediate->getShouldBePooled());
    // If we have already been assigned a physical register, we must have come
    // from advancedPhiLowering()=>lowerAssign(). In this case we should reuse
    // the assigned register, as this assignment is the start of its use-def
    // chain. So we pass the RegNum argument here.
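    //
    // For illustration (the label format is hypothetical): pooling the i32
    // constant 42 would emit something like
    //   mov .L$i32$42, %reg
    // where .L$i32$42 labels the pooled constant in the read-only data
    // section, so the load replaces the inline immediate.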
    Variable *Reg = makeReg(Immediate->getType(), RegNum);
    constexpr RelocOffsetT Offset = 0;
    Constant *Symbol = Ctx->getConstantSym(Offset, Immediate->getLabelName());
    constexpr Variable *NoBase = nullptr;
    X86OperandMem *MemOperand =
        X86OperandMem::create(Func, Immediate->getType(), NoBase, Symbol);
    _mov(Reg, MemOperand);
    return Reg;
  }
  }
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::randomizeOrPoolImmediate(X86OperandMem *MemOperand,
                                                    RegNumT RegNum) {
  assert(MemOperand);
  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
      RandomizationPoolingPaused) {
    // Immediate randomization/pooling is off or paused.
    return MemOperand;
  }

  if (Traits::Is64Bit && NeedSandboxing) {
    // Immediate randomization/pooling is currently disabled for x86-64
    // sandboxing because it could generate invalid memory operands.
    assert(false &&
           "Constant pooling/randomization is disabled for x8664 sandbox.");
    return MemOperand;
  }

  // If this memory operand has already been randomized, do not randomize it
  // again.
  if (MemOperand->getRandomized())
    return MemOperand;

  auto *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset());

  if (C == nullptr) {
    return MemOperand;
  }

  if (!C->shouldBeRandomizedOrPooled()) {
    return MemOperand;
  }

  // The offset of this memory operand should be blinded or pooled.
  Ctx->statsUpdateRPImms();
  switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
  default:
    llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
  case RPI_Randomize: {
    // Blind the constant offset.
    // FROM:
    //  offset[base, index, shift]
    // TO:
    //  insert: lea offset+cookie[base], RegTemp
    //  => -cookie[RegTemp, index, shift]
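    //
    // A hypothetical worked example (values for illustration only): with
    // offset = 0x100 and cookie = 0x7777, the operand 0x100(%base,%index,4)
    // becomes
    //   lea 0x7877(%base), %regtemp     ; offset + cookie
    //   -0x7777(%regtemp,%index,4)      ; cookie subtracted via displacement
    // which computes the same address without exposing the raw offset.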
    uint32_t Value =
        llvm::cast<ConstantInteger32>(MemOperand->getOffset())->getValue();
    uint32_t Cookie = Func->getConstantBlindingCookie();
    Constant *Mask1 =
        Ctx->getConstantInt(MemOperand->getOffset()->getType(), Cookie + Value);
    Constant *Mask2 =
        Ctx->getConstantInt(MemOperand->getOffset()->getType(), 0 - Cookie);

    X86OperandMem *TempMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), MemOperand->getBase(), Mask1);
    // If we have already been assigned a physical register, we must have come
    // from advancedPhiLowering()=>lowerAssign(). In this case we should reuse
    // the assigned register, as this assignment is the start of its use-def
    // chain. So we pass the RegNum argument here.
    Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum);
    _lea(RegTemp, TempMemOperand);

    X86OperandMem *NewMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), RegTemp, Mask2, MemOperand->getIndex(),
        MemOperand->getShift(), MemOperand->getSegmentRegister(),
        MemOperand->getIsRebased());

    // Label this memory operand as randomized, so we won't randomize it
    // again in case we call legalize() multiple times on this memory
    // operand.
    NewMemOperand->setRandomized(true);
    return NewMemOperand;
  }
  case RPI_Pool: {
    // Pool the constant offset.
    // FROM:
    //  offset[base, index, shift]
    // TO:
    //  insert: mov $label, RegTemp
    //  insert: lea [base, RegTemp], RegTemp
    //  => [RegTemp, index, shift]
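    //
    // For illustration (the label name is hypothetical): an operand
    // 0x100(%base,%index,4) whose offset 0x100 is pooled would become
    //   mov .L$i32$256, %regtemp
    //   lea (%base,%regtemp), %regtemp
    //   (%regtemp,%index,4)
    // so the pooled offset and base are folded into RegTemp up front.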

    // A memory operand should never appear as a source operand in phi
    // lowering assignments, so there is no need to reuse any register here.
    // In general, we should not ask for new physical registers during phi
    // lowering. However, if we do encounter a memory operand during phi
    // lowering, we do not blind or pool its immediate for now.
    if (RegNum.hasValue())
      return MemOperand;
    Variable *RegTemp = makeReg(IceType_i32);
    assert(MemOperand->getOffset()->getShouldBePooled());
    constexpr RelocOffsetT SymOffset = 0;
    Constant *Symbol =
        Ctx->getConstantSym(SymOffset, MemOperand->getOffset()->getLabelName());
    constexpr Variable *NoBase = nullptr;
    X86OperandMem *SymbolOperand = X86OperandMem::create(
        Func, MemOperand->getOffset()->getType(), NoBase, Symbol);
    _mov(RegTemp, SymbolOperand);
    // If the memory operand has a base variable, emit a lea instruction to
    // add the base variable's value to RegTemp; without a base variable, no
    // lea is needed.
    if (MemOperand->getBase()) {
      X86OperandMem *CalculateOperand = X86OperandMem::create(
          Func, MemOperand->getType(), MemOperand->getBase(), nullptr, RegTemp,
          0, MemOperand->getSegmentRegister());
      _lea(RegTemp, CalculateOperand);
    }
    X86OperandMem *NewMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), RegTemp, nullptr, MemOperand->getIndex(),
        MemOperand->getShift(), MemOperand->getSegmentRegister());
    return NewMemOperand;
  }
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emitJumpTable(
    const Cfg *, const InstJumpTable *JumpTable) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  const bool UseNonsfi = getFlags().getUseNonsfi();
  const char *Prefix = UseNonsfi ? ".data.rel.ro." : ".rodata.";
  Str << "\t.section\t" << Prefix << JumpTable->getSectionName()
      << ",\"a\",@progbits\n"
         "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n"
      << JumpTable->getName() << ":";

  // On x86 ILP32, pointers are 32-bit; hence the use of .long.
  for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
    Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
  Str << "\n";
}
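
// For illustration, a three-target jump table (section and label names
// hypothetical) would be emitted roughly as:
//   .section .rodata.foo$jumptable,"a",@progbits
//   .align 4
//   foo$jumptable:
//   .long target0
//   .long target1
//   .long target2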

template <typename TraitsType>
template <typename T>
void TargetDataX86<TraitsType>::emitConstantPool(GlobalContext *Ctx) {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Type Ty = T::Ty;
  SizeT Align = typeAlignInBytes(Ty);
  ConstantList Pool = Ctx->getConstantPool(Ty);

  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
      << "\n";
  Str << "\t.align\t" << Align << "\n";

  // If the reorder-pooled-constants option is set, shuffle the constant pool
  // before emitting it.
  if (getFlags().getReorderPooledConstants() && !Pool.empty()) {
    // Use the constant's kind value as the salt for seeding the random
    // number generator.
    Operand::OperandKind K = (*Pool.begin())->getKind();
    RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                              RPE_PooledConstantReordering, K);
    RandomShuffle(Pool.begin(), Pool.end(),
                  [&RNG](uint64_t N) { return (uint32_t)RNG.next(N); });
  }

  for (Constant *C : Pool) {
    if (!C->getShouldBePooled())
      continue;
    auto *Const = llvm::cast<typename T::IceType>(C);
    typename T::IceType::PrimType Value = Const->getValue();
    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
    // breaking strict-aliasing rules.
    typename T::PrimitiveIntType RawValue;
    memcpy(&RawValue, &Value, sizeof(Value));
    char buf[30];
    int CharsPrinted =
        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
    assert(CharsPrinted >= 0);
    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
    (void)CharsPrinted; // avoid warnings if asserts are disabled
    Str << Const->getLabelName();
    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
        << Value << " */\n";
  }
}
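
// For illustration (label format and AsmTag value hypothetical), a pooled
// f32 constant 1.0 (bit pattern 0x3f800000) would be emitted roughly as:
//   .section .rodata.cst4,"aM",@progbits,4
//   .align 4
//   .L$f32$0:
//   .long 0x3f800000 /* f32 1.000000e+00 */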

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerConstants() {
  if (getFlags().getDisableTranslation())
    return;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();

    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);

    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);

    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);

    emitConstantPool<PoolTypeConverter<float>>(Ctx);
    emitConstantPool<PoolTypeConverter<double>>(Ctx);
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerJumpTables() {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
    const FixupKind RelocationKind =
        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
    for (const JumpTableData &JT : Ctx->getJumpTables())
      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
  } break;
  case FT_Asm:
    // Already emitted from Cfg
    break;
  case FT_Iasm: {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
    for (const JumpTableData &JT : Ctx->getJumpTables()) {
      Str << "\t.section\t" << Prefix << JT.getSectionName()
          << ",\"a\",@progbits\n"
             "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n"
          << JT.getName().toString() << ":";

      // On x86-64 ILP32, pointers are 32-bit; hence the use of .long.
      for (intptr_t TargetOffset : JT.getTargetOffsets())
        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
      Str << "\n";
    }
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerGlobals(
    const VariableDeclarationList &Vars, const std::string &SectionSuffix) {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);
    for (const VariableDeclaration *Var : Vars) {
      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
        emitGlobal(*Var, SectionSuffix);
      }
    }
  } break;
  }
}
} // end of namespace X86NAMESPACE
} // end of namespace Ice

#endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H