//===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the TargetLoweringX86Base class, which consists almost
/// entirely of the lowering sequence for each high-level instruction.
///
//===----------------------------------------------------------------------===//

#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
#define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H

#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstVarIter.h"
#include "IceInstX86Base.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IcePhiLoweringImpl.h"
#include "IceUtils.h"
#include "IceVariableSplitting.h"

#include "llvm/Support/MathExtras.h"

#include <stack>

namespace Ice {
namespace X86 {
template <typename T> struct PoolTypeConverter {};

template <> struct PoolTypeConverter<float> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantFloat;
  static const Type Ty = IceType_f32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

template <> struct PoolTypeConverter<double> {
  using PrimitiveIntType = uint64_t;
  using IceType = ConstantDouble;
  static const Type Ty = IceType_f64;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint32_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint16_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i16;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

// Add converter for int type constant pooling
template <> struct PoolTypeConverter<uint8_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i8;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};
} // end of namespace X86

namespace X86NAMESPACE {

// The Microsoft x64 ABI requires the caller to allocate a minimum 32-byte
// "shadow store" (aka "home space") so that the callee may copy the 4
// register args to it.
template <typename Traits> SizeT getShadowStoreSize() {
#if defined(SUBZERO_USE_MICROSOFT_ABI)
  static const SizeT ShadowStoreSize =
      Traits::Is64Bit ? 4 * typeWidthInBytes(Traits::WordType) : 0;
  return ShadowStoreSize;
#else
  return 0;
#endif
}
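
// As an illustration (a sketch, not code emitted here): under the Microsoft
// x64 ABI a call site reserves the 32 bytes just below the return address,
// roughly
//   sub  rsp, 40        ; 32-byte shadow store + 8 to keep 16-byte alignment
//   call Callee         ; Callee may spill rcx/rdx/r8/r9 into that area
//   add  rsp, 40
// addProlog() below accounts for this space via
// "SpillAreaSizeBytes += ShadowStoreSize;".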

using Utils::BoolFlagSaver;

template <typename Traits> class BoolFoldingEntry {
  BoolFoldingEntry(const BoolFoldingEntry &) = delete;

public:
  BoolFoldingEntry() = default;
  explicit BoolFoldingEntry(Inst *I);
  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
  /// Instr is the instruction producing the i1-type variable of interest.
  Inst *Instr = nullptr;
  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
  bool IsComplex = false;
  /// IsLiveOut is initialized conservatively to true, and is set to false when
  /// we encounter an instruction that ends Var's live range. We disable the
  /// folding optimization when Var is live beyond this basic block. Note that
  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
  /// always be true and the folding optimization will never be performed.
  bool IsLiveOut = true;
  // NumUses counts the number of times Var is used as a source operand in the
  // basic block. If IsComplex is true and there is more than one use of Var,
  // then the folding optimization is disabled for Var.
  uint32_t NumUses = 0;
};

template <typename Traits> class BoolFolding {
public:
  enum BoolFoldingProducerKind {
    PK_None,
    // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
    PK_Icmp32,
    PK_Icmp64,
    PK_Fcmp,
    PK_Trunc,
    PK_Arith // A flag-setting arithmetic instruction.
  };

  /// Currently the actual enum values are not used (other than CK_None), but we
  /// go ahead and produce them anyway for symmetry with the
  /// BoolFoldingProducerKind.
  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };

private:
  BoolFolding(const BoolFolding &) = delete;
  BoolFolding &operator=(const BoolFolding &) = delete;

public:
  BoolFolding() = default;
  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
  static bool hasComplexLowering(const Inst *Instr);
  static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
                             BoolFoldingConsumerKind ConsumerKind);
  void init(CfgNode *Node);
  const Inst *getProducerFor(const Operand *Opnd) const;
  void dump(const Cfg *Func) const;

private:
  /// Returns true if Producers contains a valid entry for the given VarNum.
  bool containsValid(SizeT VarNum) const {
    auto Element = Producers.find(VarNum);
    return Element != Producers.end() && Element->second.Instr != nullptr;
  }
  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
  void invalidateProducersOnStore(const Inst *Instr);
  /// Producers maps Variable::Number to a BoolFoldingEntry.
  CfgUnorderedMap<SizeT, BoolFoldingEntry<Traits>> Producers;
};

template <typename Traits>
BoolFoldingEntry<Traits>::BoolFoldingEntry(Inst *I)
    : Instr(I), IsComplex(BoolFolding<Traits>::hasComplexLowering(I)) {}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingProducerKind
BoolFolding<Traits>::getProducerKind(const Inst *Instr) {
  if (llvm::isa<InstIcmp>(Instr)) {
    if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
      return PK_Icmp32;
    return PK_Icmp64;
  }
  if (llvm::isa<InstFcmp>(Instr))
    return PK_Fcmp;
  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
    if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
      switch (Arith->getOp()) {
      default:
        return PK_None;
      case InstArithmetic::And:
      case InstArithmetic::Or:
        return PK_Arith;
      }
    }
  }
  return PK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return PK_None;
    case InstCast::Trunc:
      return PK_Trunc;
    }
  }
  return PK_None;
}

template <typename Traits>
typename BoolFolding<Traits>::BoolFoldingConsumerKind
BoolFolding<Traits>::getConsumerKind(const Inst *Instr) {
  if (llvm::isa<InstBr>(Instr))
    return CK_Br;
  if (llvm::isa<InstSelect>(Instr))
    return CK_Select;
  return CK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return CK_None;
    case InstCast::Sext:
      return CK_Sext;
    case InstCast::Zext:
      return CK_Zext;
    }
  }
  return CK_None;
}

/// Returns true if the producing instruction has a "complex" lowering sequence.
/// This generally means that its lowering sequence requires more than one
/// conditional branch, namely 64-bit integer compares and some floating-point
/// compares. When this is true and there is more than one consumer, we prefer
/// to disable the folding optimization, because folding would duplicate the
/// expensive multi-branch sequence at each consumer.
template <typename Traits>
bool BoolFolding<Traits>::hasComplexLowering(const Inst *Instr) {
  switch (getProducerKind(Instr)) {
  default:
    return false;
  case PK_Icmp64:
    return !Traits::Is64Bit;
  case PK_Fcmp:
    return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
           Traits::Cond::Br_None;
  }
}
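
// For example (a sketch): on a 32-bit target, an i64 compare such as
//   %cond = icmp slt i64 %a, %b
// cannot be lowered to a single cmp; it takes a compare of the high words,
// a conditional branch, and a compare of the low words. That multi-branch
// sequence is why PK_Icmp64 is considered complex when !Traits::Is64Bit.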

template <typename Traits>
bool BoolFolding<Traits>::isValidFolding(
    typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind,
    typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind) {
  switch (ProducerKind) {
  default:
    return false;
  case PK_Icmp32:
  case PK_Icmp64:
  case PK_Fcmp:
    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
  case PK_Arith:
    return ConsumerKind == CK_Br;
  }
}
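
// A sketch of the folding these kinds permit: given the producer/consumer
// pair
//   %cond = icmp slt i32 %a, %b     ; producer (PK_Icmp32)
//   br i1 %cond, label %T, label %F ; consumer (CK_Br)
// the producer is marked dead and its compare is emitted as part of the
// br's lowering, roughly
//   cmp a, b
//   jl  T
// instead of first materializing %cond in a register via setcc.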

template <typename Traits> void BoolFolding<Traits>::init(CfgNode *Node) {
  Producers.clear();
  for (Inst &Instr : Node->getInsts()) {
    if (Instr.isDeleted())
      continue;
    invalidateProducersOnStore(&Instr);
    // Check whether Instr is a valid producer.
    Variable *Var = Instr.getDest();
    if (Var) { // only consider instructions with an actual dest var
      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
          Producers[Var->getIndex()] = BoolFoldingEntry<Traits>(&Instr);
        }
      }
    }
    // Check each src variable against the map.
    FOREACH_VAR_IN_INST(Var, Instr) {
      SizeT VarNum = Var->getIndex();
      if (!containsValid(VarNum))
        continue;
      // All valid consumers use Var as the first source operand
      if (IndexOfVarOperandInInst(Var) != 0) {
        setInvalid(VarNum);
        continue;
      }
      // Consumer instructions must be white-listed
      typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind =
          getConsumerKind(&Instr);
      if (ConsumerKind == CK_None) {
        setInvalid(VarNum);
        continue;
      }
      typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind =
          getProducerKind(Producers[VarNum].Instr);
      if (!isValidFolding(ProducerKind, ConsumerKind)) {
        setInvalid(VarNum);
        continue;
      }
      // Avoid creating multiple copies of complex producer instructions.
      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
        setInvalid(VarNum);
        continue;
      }
      ++Producers[VarNum].NumUses;
      if (Instr.isLastUse(Var)) {
        Producers[VarNum].IsLiveOut = false;
      }
    }
  }
  for (auto &I : Producers) {
    // Ignore entries previously marked invalid.
    if (I.second.Instr == nullptr)
      continue;
    // Disable the producer if its dest may be live beyond this block.
    if (I.second.IsLiveOut) {
      setInvalid(I.first);
      continue;
    }
    // Mark as "dead" rather than outright deleting. This is so that other
    // peephole style optimizations during or before lowering have access to
    // this instruction in undeleted form. See for example
    // tryOptimizedCmpxchgCmpBr().
    I.second.Instr->setDead();
  }
}

template <typename Traits>
const Inst *BoolFolding<Traits>::getProducerFor(const Operand *Opnd) const {
  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
  if (Var == nullptr)
    return nullptr;
  SizeT VarNum = Var->getIndex();
  auto Element = Producers.find(VarNum);
  if (Element == Producers.end())
    return nullptr;
  return Element->second.Instr;
}

template <typename Traits>
void BoolFolding<Traits>::dump(const Cfg *Func) const {
  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
    return;
  OstreamLocker L(Func->getContext());
  Ostream &Str = Func->getContext()->getStrDump();
  for (auto &I : Producers) {
    if (I.second.Instr == nullptr)
      continue;
    Str << "Found foldable producer:\n  ";
    I.second.Instr->dump(Func);
    Str << "\n";
  }
}

/// If the given instruction has potential memory side effects (e.g. store, rmw,
/// or a call instruction with potential memory side effects), then we must not
/// allow a pre-store Producer instruction with memory operands to be folded
/// into a post-store Consumer instruction.  If this is detected, the Producer
/// is invalidated.
///
/// We use the Producer's IsLiveOut field to determine whether any potential
/// Consumers come after this store instruction.  The IsLiveOut field is
/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
/// sees the variable's definitive last use (indicating the variable is not in
/// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
/// know that there can be no consumers after the store, and therefore we know
/// the folding is safe despite the store instruction.
template <typename Traits>
void BoolFolding<Traits>::invalidateProducersOnStore(const Inst *Instr) {
  if (!Instr->isMemoryWrite())
    return;
  for (auto &ProducerPair : Producers) {
    if (!ProducerPair.second.IsLiveOut)
      continue;
    Inst *PInst = ProducerPair.second.Instr;
    if (PInst == nullptr)
      continue;
    bool HasMemOperand = false;
    const SizeT SrcSize = PInst->getSrcSize();
    for (SizeT I = 0; I < SrcSize; ++I) {
      if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) {
        HasMemOperand = true;
        break;
      }
    }
    if (!HasMemOperand)
      continue;
    setInvalid(ProducerPair.first);
  }
}
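
// A sketch of the hazard this check avoids: if a producer reads memory,
//   %c = icmp eq i32 [addr], 0 ; producer with a memory operand
//   store i32 %x, [addr]       ; intervening memory write
//   br i1 %c, ...              ; consumer
// then folding the icmp into the br would move the memory read past the
// store and observe the updated value, so the producer is invalidated
// instead.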

template <typename TraitsType>
void TargetX86Base<TraitsType>::initNodeForLowering(CfgNode *Node) {
  FoldingInfo.init(Node);
  FoldingInfo.dump(Func);
}

template <typename TraitsType>
TargetX86Base<TraitsType>::TargetX86Base(Cfg *Func)
    : TargetLowering(Func), NeedSandboxing(SandboxingType == ST_NaCl) {
  static_assert(
      (Traits::InstructionSet::End - Traits::InstructionSet::Begin) ==
          (TargetInstructionSet::X86InstructionSet_End -
           TargetInstructionSet::X86InstructionSet_Begin),
      "Traits::InstructionSet range different from TargetInstructionSet");
  if (getFlags().getTargetInstructionSet() !=
      TargetInstructionSet::BaseInstructionSet) {
    InstructionSet = static_cast<InstructionSetEnum>(
        (getFlags().getTargetInstructionSet() -
         TargetInstructionSet::X86InstructionSet_Begin) +
        Traits::InstructionSet::Begin);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
  RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
  Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
  filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
                          TypeToRegisterSet.data(), TypeToRegisterSet.size(),
                          Traits::getRegName, getRegClassName);
  PcRelFixup = Traits::FK_PcRel;
  AbsFixup = getFlags().getUseNonsfi() ? Traits::FK_Gotoff : Traits::FK_Abs;
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::shouldBePooled(const Constant *C) {
  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
    return !Utils::isPositiveZero(ConstFloat->getValue());
  }
  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
    return !Utils::isPositiveZero(ConstDouble->getValue());
  }
  if (getFlags().getRandomizeAndPoolImmediatesOption() != RPI_Pool) {
    return false;
  }
  return C->shouldBeRandomizedOrPooled();
}

template <typename TraitsType>
::Ice::Type TargetX86Base<TraitsType>::getPointerType() {
  if (!Traits::Is64Bit ||
      ::Ice::getFlags().getApplicationBinaryInterface() == ::Ice::ABI_PNaCl) {
    return ::Ice::IceType_i32;
  }
  return ::Ice::IceType_i64;
}

template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();
  Func->dump("After target helper call insertion");

  // Merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = true;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  // Run this early so it can be used to focus optimizations on potentially hot
  // code.
  // TODO(stichnot,ascull): currently only used for regalloc, not for
  // expensive high-level optimizations that could be focused on potentially
  // hot code.
  Func->generateLoopInfo();
  Func->dump("After loop analysis");
  if (getFlags().getLoopInvariantCodeMotion()) {
    Func->loopInvariantCodeMotion();
    Func->dump("After LICM");
  }

  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
    Func->dump("After Local CSE");
    Func->floatConstantCSE();
  }
  if (getFlags().getEnableShortCircuit()) {
    Func->shortCircuitJumps();
    Func->dump("After Short Circuiting");
  }

  if (!getFlags().getEnablePhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
    if (Func->hasError())
      return;
    Func->placePhiStores();
    if (Func->hasError())
      return;
    Func->deletePhis();
    if (Func->hasError())
      return;
    Func->dump("After Phi lowering");
  }

  // Address mode optimization.
  Func->getVMetadata()->init(VMK_SingleDefs);
  Func->doAddressOpt();
  Func->materializeVectorShuffles();

  // Find read-modify-write opportunities. Do this after address mode
  // optimization so that doAddressOpt() doesn't need to be applied to RMW
  // instructions as well.
  findRMW();
  Func->dump("After RMW transform");

  // Argument lowering
  Func->doArgLowering();

  // Target lowering. This requires liveness analysis for some parts of the
  // lowering decisions, such as compare/branch fusing. If non-lightweight
  // liveness analysis is used, the instructions need to be renumbered first.
  // TODO: This renumbering should only be necessary if we're actually
  // calculating live intervals, which we only do for register allocation.
  Func->renumberInstructions();
  if (Func->hasError())
    return;

  // TODO: It should be sufficient to use the fastest liveness calculation,
  // i.e. livenessLightweight(). However, for some reason that slows down the
  // rest of the translation. Investigate.
  Func->liveness(Liveness_Basic);
  if (Func->hasError())
    return;
  Func->dump("After x86 address mode opt");

  // Disable constant blinding or pooling for load optimization.
  {
    BoolFlagSaver B(RandomizationPoolingPaused, true);
    doLoadOpt();
  }
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After x86 codegen");
  splitBlockLocalVariables(Func);

  // Register allocation. This requires instruction renumbering and full
  // liveness analysis. Loops must be identified before liveness so variable
  // use weights are correct.
  Func->renumberInstructions();
  if (Func->hasError())
    return;
  Func->liveness(Liveness_Intervals);
  if (Func->hasError())
    return;
  // The post-codegen dump is done here, after liveness analysis and associated
  // cleanup, to make the dump cleaner and more useful.
  Func->dump("After initial x86 codegen");
  // Validate the live range computations. The expensive validation call is
  // deliberately only made when assertions are enabled.
  assert(Func->validateLiveness());
  Func->getVMetadata()->init(VMK_All);
  regAlloc(RAK_Global);
  if (Func->hasError())
    return;
  Func->dump("After linear scan regalloc");

  if (getFlags().getEnablePhiEdgeSplit()) {
    Func->advancedPhiLowering();
    Func->dump("After advanced Phi lowering");
  }

  // Stack frame mapping.
  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  Func->contractEmptyNodes();
  Func->reorderNodes();

  // Shuffle basic block order if -reorder-basic-blocks is enabled.
  Func->shuffleNodes();

  // Branch optimization.  This needs to be done just before code emission. In
  // particular, no transformations that insert or reorder CfgNodes should be
  // done after branch optimization. We go ahead and do it before nop insertion
  // to reduce the amount of work needed for searching for opportunities.
  Func->doBranchOpt();
  Func->dump("After branch optimization");

  // Nop insertion if -nop-insertion is enabled.
  Func->doNopInsertion();

  // Mark nodes that require sandbox alignment
  if (NeedSandboxing) {
    Func->markNodesForSandboxing();
  }
}

template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() {
  TimerMarker T(TimerStack::TT_Om1, Func);

  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();

  // Do not merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = false;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  Func->placePhiLoads();
  if (Func->hasError())
    return;
  Func->placePhiStores();
  if (Func->hasError())
    return;
  Func->deletePhis();
  if (Func->hasError())
    return;
  Func->dump("After Phi lowering");

  Func->doArgLowering();
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After initial x86 codegen");

  regAlloc(RAK_InfOnly);
  if (Func->hasError())
    return;
  Func->dump("After regalloc of infinite-weight variables");

  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  // Shuffle basic block order if -reorder-basic-blocks is enabled.
  Func->shuffleNodes();

  // Nop insertion if -nop-insertion is enabled.
  Func->doNopInsertion();

  // Mark nodes that require sandbox alignment
  if (NeedSandboxing)
    Func->markNodesForSandboxing();
}

inline bool canRMW(const InstArithmetic *Arith) {
  Type Ty = Arith->getDest()->getType();
  // X86 vector instructions write to a register and have no RMW option.
  if (isVectorType(Ty))
    return false;
  bool isI64 = Ty == IceType_i64;

  switch (Arith->getOp()) {
  // Not handled for lack of simple lowering:
  //   shift on i64
  //   mul, udiv, urem, sdiv, srem, frem
  // Not handled for lack of RMW instructions:
  //   fadd, fsub, fmul, fdiv (also vector types)
  default:
    return false;
  case InstArithmetic::Add:
  case InstArithmetic::Sub:
  case InstArithmetic::And:
  case InstArithmetic::Or:
  case InstArithmetic::Xor:
    return true;
  case InstArithmetic::Shl:
  case InstArithmetic::Lshr:
  case InstArithmetic::Ashr:
    return false; // TODO(stichnot): implement
    return !isI64;
  }
}

template <typename TraitsType>
bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
  if (A == B)
    return true;
  if (auto *MemA =
          llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
              A)) {
    if (auto *MemB =
            llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
                B)) {
      return MemA->getBase() == MemB->getBase() &&
             MemA->getOffset() == MemB->getOffset() &&
             MemA->getIndex() == MemB->getIndex() &&
             MemA->getShift() == MemB->getShift() &&
             MemA->getSegmentRegister() == MemB->getSegmentRegister();
    }
  }
  return false;
}

template <typename TraitsType> void TargetX86Base<TraitsType>::findRMW() {
  TimerMarker _(TimerStack::TT_findRMW, Func);
  Func->dump("Before RMW");
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->lockStr();
  for (CfgNode *Node : Func->getNodes()) {
    // Walk through the instructions, considering each sequence of 3
    // instructions, and look for the particular RMW pattern. Note that this
    // search can be "broken" (false negatives) if there are intervening
    // deleted instructions, or intervening instructions that could be safely
    // moved out of the way to reveal an RMW pattern.
    auto E = Node->getInsts().end();
    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
      // Make I3 skip over deleted instructions.
      while (I3 != E && I3->isDeleted())
        ++I3;
      if (I1 == E || I2 == E || I3 == E)
        continue;
      assert(!I1->isDeleted());
      assert(!I2->isDeleted());
      assert(!I3->isDeleted());
      auto *Load = llvm::dyn_cast<InstLoad>(I1);
      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
      auto *Store = llvm::dyn_cast<InstStore>(I3);
      if (!Load || !Arith || !Store)
        continue;
      // Look for:
      //   a = Load addr
      //   b = <op> a, other
      //   Store b, addr
      // Change to:
      //   a = Load addr
      //   b = <op> a, other
      //   x = FakeDef
      //   RMW <op>, addr, other, x
      //   b = Store b, addr, x
      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
      // on the updated Store instruction, to avoid liveness problems later.
      //
      // With this transformation, the Store instruction acquires a Dest
      // variable and is now subject to dead code elimination if there are no
      // more uses of "b".  Variable "x" is a beacon for determining whether the
      // Store instruction gets dead-code eliminated.  If the Store instruction
      // is eliminated, then it must be the case that the RMW instruction ends
      // x's live range, and therefore the RMW instruction will be retained and
      // later lowered.  On the other hand, if the RMW instruction does not end
      // x's live range, then the Store instruction must still be present, and
      // therefore the RMW instruction is ignored during lowering because it is
      // redundant with the Store instruction.
      //
      // Note that if "a" has further uses, the RMW transformation may still
      // trigger, resulting in two loads and one store, which is worse than the
      // original one load and one store.  However, this is probably rare, and
      // caching probably keeps it just as fast.
      if (!isSameMemAddressOperand<TraitsType>(Load->getSourceAddress(),
                                               Store->getAddr()))
        continue;
      Operand *ArithSrcFromLoad = Arith->getSrc(0);
      Operand *ArithSrcOther = Arith->getSrc(1);
      if (ArithSrcFromLoad != Load->getDest()) {
        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
          continue;
        std::swap(ArithSrcFromLoad, ArithSrcOther);
      }
      if (Arith->getDest() != Store->getData())
        continue;
      if (!canRMW(Arith))
        continue;
      if (Func->isVerbose(IceV_RMW)) {
        Ostream &Str = Func->getContext()->getStrDump();
        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
        Load->dump(Func);
        Str << "\n  ";
        Arith->dump(Func);
        Str << "\n  ";
        Store->dump(Func);
        Str << "\n";
      }
      Variable *Beacon = Func->makeVariable(IceType_i32);
      Beacon->setMustNotHaveReg();
      Store->setRmwBeacon(Beacon);
      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
      Node->getInsts().insert(I3, BeaconDef);
      auto *RMW = InstX86FakeRMW::create(Func, ArithSrcOther, Store->getAddr(),
                                         Beacon, Arith->getOp());
      Node->getInsts().insert(I3, RMW);
    }
  }
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->unlockStr();
}
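
// When the pattern matches and the Store is later dead-code eliminated, the
// RMW instruction lowers to a single memory-destination operation, e.g. (a
// sketch)
//   add DWORD PTR [addr], other
// in place of separate load, add, and store instructions.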

// Converts a ConstantInteger32 operand into its constant value, or
// MemoryOrderInvalid if the operand is not a ConstantInteger32.
inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
    return Integer->getValue();
  return Intrinsics::MemoryOrderInvalid;
}

/// Determines whether the dest of a Load instruction can be folded into one of
/// the src operands of a 2-operand instruction. This is true as long as the
/// load dest matches exactly one of the binary instruction's src operands.
/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
                                      Operand *&Src0, Operand *&Src1) {
  if (Src0 == LoadDest && Src1 != LoadDest) {
    Src0 = LoadSrc;
    return true;
  }
  if (Src0 != LoadDest && Src1 == LoadDest) {
    Src1 = LoadSrc;
    return true;
  }
  return false;
}
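
// A sketch of the fold performed by doLoadOpt() below: given
//   %v = load i32, i32* %p
//   %r = add i32 %q, %v   ; the add is the last use of %v
// the pair is rewritten so the load folds into the add's source operand:
//   %r = add i32 %q, [%p]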

template <typename TraitsType> void TargetX86Base<TraitsType>::doLoadOpt() {
  TimerMarker _(TimerStack::TT_loadOpt, Func);
  for (CfgNode *Node : Func->getNodes()) {
    Context.init(Node);
    while (!Context.atEnd()) {
      Variable *LoadDest = nullptr;
      Operand *LoadSrc = nullptr;
      Inst *CurInst = iteratorToInst(Context.getCur());
      Inst *Next = Context.getNextInst();
      // Determine whether the current instruction is a Load instruction or
      // equivalent.
      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
        // An InstLoad always qualifies.
        LoadDest = Load->getDest();
        constexpr bool DoLegalize = false;
        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
                                    LoadDest->getType(), DoLegalize);
      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
        // ordering, and can be implemented in a single instruction (i.e., not
        // i64 on x86-32).
        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
        if (ID == Intrinsics::AtomicLoad &&
            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
            Intrinsics::isMemoryOrderValid(
                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
          LoadDest = Intrin->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
                                      DoLegalize);
        }
      }
      // A Load instruction can be folded into the following instruction only
      // if the following instruction ends the Load's Dest variable's live
      // range.
      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
        assert(LoadSrc);
        Inst *NewInst = nullptr;
        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
          Operand *Src0 = Arith->getSrc(0);
          Operand *Src1 = Arith->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstArithmetic::create(Func, Arith->getOp(),
                                             Arith->getDest(), Src0, Src1);
          }
        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
          Operand *Src0 = Icmp->getSrc(0);
          Operand *Src1 = Icmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
                                       Icmp->getDest(), Src0, Src1);
          }
        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
          Operand *Src0 = Fcmp->getSrc(0);
          Operand *Src1 = Fcmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
                                       Fcmp->getDest(), Src0, Src1);
          }
        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
          Operand *Src0 = Select->getTrueOperand();
          Operand *Src1 = Select->getFalseOperand();
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstSelect::create(Func, Select->getDest(),
                                         Select->getCondition(), Src0, Src1);
          }
        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
          // The load dest can always be folded into a Cast instruction.
          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
          if (Src0 == LoadDest) {
            NewInst = InstCast::create(Func, Cast->getCastKind(),
                                       Cast->getDest(), LoadSrc);
          }
        }
        if (NewInst) {
          CurInst->setDeleted();
          Next->setDeleted();
          Context.insert(NewInst);
          // Update NewInst->LiveRangesEnded so that target lowering may
          // benefit. Also update NewInst->HasSideEffects.
          NewInst->spliceLivenessInfo(Next, CurInst);
        }
      }
      Context.advanceCur();
      Context.advanceNext();
    }
  }
  Func->dump("After load optimization");
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
    return Br->optimizeBranch(NextNode);
  }
  return false;
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::getPhysicalRegister(RegNumT RegNum,
                                                         Type Ty) {
  if (Ty == IceType_void)
    Ty = IceType_i32;
  if (PhysicalRegisters[Ty].empty())
    PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
  Variable *Reg = PhysicalRegisters[Ty][RegNum];
  if (Reg == nullptr) {
    Reg = Func->makeVariable(Ty);
    Reg->setRegNum(RegNum);
    PhysicalRegisters[Ty][RegNum] = Reg;
    // Specially mark a named physical register as an "argument" so that it is
    // considered live upon function entry.  Otherwise it's possible to get
    // liveness validation errors for saving callee-save registers.
    Func->addImplicitArg(Reg);
    // Don't bother tracking the live range of a named physical register.
    Reg->setIgnoreLiveness();
  }
  assert(Traits::getGprForType(Ty, RegNum) == RegNum);
  return Reg;
}

template <typename TraitsType>
const char *TargetX86Base<TraitsType>::getRegName(RegNumT RegNum,
                                                  Type Ty) const {
  return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emitVariable(const Variable *Var) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  if (Var->hasReg()) {
    const bool Is64BitSandboxing = Traits::Is64Bit && NeedSandboxing;
    const Type VarType = (Var->isRematerializable() && Is64BitSandboxing)
                             ? IceType_i64
                             : Var->getType();
    Str << "%" << getRegName(Var->getRegNum(), VarType);
    return;
  }
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  const int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (BaseRegNum.hasNoValue())
    BaseRegNum = getFrameOrStackReg();

  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
  if (getFlags().getDecorateAsm()) {
    Str << Var->getSymbolicStackOffset();
  } else if (Offset != 0) {
    Str << Offset;
  }
  const Type FrameSPTy = Traits::WordType;
  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
}
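
// For example, a register-allocated i32 variable emits as "%eax", while a
// stack variable at offset -8 from the frame pointer emits as "-8(%ebp)"
// (or just "(%ebp)" when the offset is 0).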

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86Address
TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const {
  if (Var->hasReg())
    llvm::report_fatal_error("Stack Variable has a register assigned");
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (Var->getBaseRegNum().hasNoValue()) {
    // If the stack pointer needs alignment, we must use the frame pointer for
    // arguments. For locals, getFrameOrStackReg will return the stack pointer
    // in this case.
    if (needsStackPointerAlignment() && Var->getIsArg()) {
      assert(hasFramePointer());
      BaseRegNum = getFrameReg();
    } else {
      BaseRegNum = getFrameOrStackReg();
    }
  }
  return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
                    AssemblerFixup::NoFixup);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
  // Stack frame layout:
  //
  // +------------------------+  ^ +
  // | 1. return address      |  |
  // +------------------------+  v -
  // | 2. preserved registers |
  // +------------------------+ <--- BasePointer (if used)
  // | 3. padding             |
  // +------------------------+
  // | 4. global spill area   |
  // +------------------------+
  // | 5. padding             |
  // +------------------------+
  // | 6. local spill area    |
  // +------------------------+
  // | 7. padding             |
  // +------------------------+
  // | 7.5 shadow (WinX64)    |
  // +------------------------+
  // | 8. allocas             |
  // +------------------------+
  // | 9. padding             |
  // +------------------------+
  // | 10. out args           |
  // +------------------------+ <--- StackPointer
  //
  // The following variables record the size in bytes of the given areas:
  //  * X86_RET_IP_SIZE_BYTES:   area 1
  //  * PreservedRegsSizeBytes:  area 2
  //  * SpillAreaPaddingBytes:   area 3
  //  * GlobalsSize:             area 4
  //  * LocalsSlotsPaddingBytes: area 5
  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
  //  * LocalsSpillAreaSize:     area 6
  //  * FixedAllocaSizeBytes:    areas 7 - 8
  //  * SpillAreaSizeBytes:      areas 3 - 10
  //  * maxOutArgsSizeBytes():   areas 9 - 10

  // Determine stack frame offsets for each Variable without a register
  // assignment. This can be done as one variable per stack slot. Or, do
  // coalescing by running the register allocator again with an infinite set of
  // registers (as a side effect, this gives variables a second chance at
  // physical register assignment).
  //
  // A middle ground approach is to leverage sparsity and allocate one block of
  // space on the frame for globals (variables with multi-block lifetime), and
  // one block to share for locals (single-block lifetime).

  const SizeT ShadowStoreSize = getShadowStoreSize<Traits>();

  // StackPointer: points just past return address of calling function

  Context.init(Node);
  Context.setInsertPoint(Context.getCur());

  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  RegsUsed = SmallBitVector(CalleeSaves.size());
  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
  size_t GlobalsSize = 0;
  // If there is a separate locals area, this represents that area. Otherwise
  // it counts any variable not counted by GlobalsSize.
  SpillAreaSizeBytes = 0;
  // If there is a separate locals area, this specifies the alignment for it.
  uint32_t LocalsSlotsAlignmentBytes = 0;
  // The entire spill locations area gets aligned to largest natural alignment
  // of the variables that have a spill slot.
  uint32_t SpillAreaAlignmentBytes = 0;
  // A spill slot linked to a variable with a stack slot should reuse that
  // stack slot.
  std::function<bool(Variable *)> TargetVarHook =
      [&VariablesLinkedToSpillSlots](Variable *Var) {
        // TODO(stichnot): Refactor this into the base class.
        Variable *Root = Var->getLinkedToStackRoot();
        if (Root != nullptr) {
          assert(!Root->hasReg());
          if (!Root->hasReg()) {
            VariablesLinkedToSpillSlots.push_back(Var);
            return true;
          }
        }
        return false;
      };

  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
                        &LocalsSlotsAlignmentBytes, TargetVarHook);
  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
  SpillAreaSizeBytes += GlobalsSize;

  // Add push instructions for preserved registers.
  uint32_t NumCallee = 0;
  size_t PreservedRegsSizeBytes = 0;
  SmallBitVector Pushed(CalleeSaves.size());
  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
    const auto Canonical = Traits::getBaseReg(i);
    assert(Canonical == Traits::getBaseReg(Canonical));
    if (RegsUsed[i]) {
      Pushed[Canonical] = true;
    }
  }
  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
    assert(RegNum == Traits::getBaseReg(RegNum));
    ++NumCallee;
    if (Traits::isXmm(RegNum)) {
      PreservedRegsSizeBytes += 16;
    } else {
      PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    }
    _push_reg(RegNum);
  }
  Ctx->statsUpdateRegistersSaved(NumCallee);

  // StackPointer: points past preserved registers at start of spill area

  // Generate "push frameptr; mov frameptr, stackptr"
  if (IsEbpBasedFrame) {
    assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
               .count() == 0);
    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    _link_bp();
  }

  // Align the variables area. SpillAreaPaddingBytes is the size of the region
  // after the preserved registers and before the spill areas.
  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
  // locals area if they are separate.
  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
  uint32_t SpillAreaPaddingBytes = 0;
  uint32_t LocalsSlotsPaddingBytes = 0;
  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
                       SpillAreaAlignmentBytes, GlobalsSize,
                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
                       &LocalsSlotsPaddingBytes);
  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
  uint32_t GlobalsAndSubsequentPaddingSize =
      GlobalsSize + LocalsSlotsPaddingBytes;

  // Functions returning scalar floating point types may need to convert values
  // from an in-register xmm value to the top of the x87 floating point stack.
  // This is done by a movp[sd] and an fld[sd].  Ensure there is enough scratch
  // space on the stack for this.
  const Type ReturnType = Func->getReturnType();
  if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
    if (isScalarFloatingType(ReturnType)) {
      // Avoid misaligned double-precision load/store.
      RequiredStackAlignment = std::max<size_t>(
          RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES);
      SpillAreaSizeBytes =
          std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
    }
  }

  RequiredStackAlignment =
      std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes);

  if (PrologEmitsFixedAllocas) {
    RequiredStackAlignment =
        std::max(RequiredStackAlignment, FixedAllocaAlignBytes);
  }

  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
  // fixed allocations in the prolog.
  if (PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  // Win64 ABI: add space for shadow store (aka home space)
  SpillAreaSizeBytes += ShadowStoreSize;

  // Entering the function has made the stack pointer unaligned. Re-align it by
  // adjusting the stack size.
  // Note that StackOffset does not include spill area. It's the offset from the
  // base stack pointer (ebp), whether we set it or not, to the first stack
  // arg (if any). StackSize, on the other hand, does include the spill area.
  const uint32_t StackOffset =
      ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
                                             RequiredStackAlignment);
  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
                                    RequiredStackAlignment);
  SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
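
  // Worked example with hypothetical numbers: on x86-32 with a 16-byte
  // RequiredStackAlignment, no shadow store, a 4-byte return IP, and 8 bytes
  // of pushed registers, StackOffset = 0 + 4 + 8 = 12. If SpillAreaSizeBytes
  // is 25, the first alignment rounds 12 + 25 = 37 up to 48; with 16 bytes of
  // out-args, 48 + 16 = 64 is already aligned, so the final
  // SpillAreaSizeBytes = 64 - 12 = 52.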

  if (SpillAreaSizeBytes) {
    emitStackProbe(SpillAreaSizeBytes);

    // Generate "sub stackptr, SpillAreaSizeBytes"
    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
  }

  // StackPointer: points just past the spill area (end of stack frame)

  // If the required alignment is greater than the stack pointer's guaranteed
  // alignment, align the stack pointer accordingly.
  if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
    assert(IsEbpBasedFrame);
    _and(getPhysicalRegister(getStackReg(), Traits::WordType),
         Ctx->getConstantInt32(-RequiredStackAlignment));
  }

  // StackPointer: may have just been offset for alignment

  // Account for known-frame-offset alloca instructions that were not already
  // combined into the prolog.
  if (!PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
  // Arg[0] is closest to the stack/frame pointer.
  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
  size_t BasicFrameOffset = StackOffset;
  if (!IsEbpBasedFrame)
    BasicFrameOffset += SpillAreaSizeBytes;

  emitGetIP(Node);

  const VarList &Args = Func->getArgs();
  size_t InArgsSizeBytes = 0;
  unsigned NumXmmArgs = 0;
  unsigned NumGPRArgs = 0;
  for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
    Variable *Arg = Args[i];
    // Skip arguments passed in registers.
    if (isVectorType(Arg->getType())) {
      if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
              .hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else if (isScalarFloatingType(Arg->getType())) {
      if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
          Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs))
              .hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else {
      assert(isScalarIntegerType(Arg->getType()));
      if (Traits::getRegisterForGprArgNum(Traits::WordType,
                                          Traits::getArgIndex(i, NumGPRArgs))
              .hasValue()) {
        ++NumGPRArgs;
        continue;
      }
    }
    // For esp-based frames where the allocas are done outside the prolog, the
    // esp value may not stabilize to its home value until after all the
    // fixed-size alloca instructions have executed.  In this case, a stack
    // adjustment is needed when accessing in-args in order to copy them into
    // registers.
    size_t StackAdjBytes = 0;
    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
      StackAdjBytes -= FixedAllocaSizeBytes;
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
                           InArgsSizeBytes);
  }

  // Fill in stack offsets for locals.
  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
                      IsEbpBasedFrame && !needsStackPointerAlignment());
  // Assign stack offsets to variables that have been linked to spilled
  // variables.
  for (Variable *Var : VariablesLinkedToSpillSlots) {
    const Variable *Root = Var->getLinkedToStackRoot();
    assert(Root != nullptr);
    Var->setStackOffset(Root->getStackOffset());
  }
  this->HasComputedFrame = true;

  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
    OstreamLocker L(Func->getContext());
    Ostream &Str = Func->getContext()->getStrDump();

    Str << "Stack layout:\n";
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
        maxOutArgsSizeBytes();
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
        << " globals spill area = " << GlobalsSize << " bytes\n"
        << " globals-locals spill areas intermediate padding = "
        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
        << " esp alignment padding = " << EspAdjustmentPaddingSize
        << " bytes\n";

    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";
  }
}

/// Helper function for addProlog().
///
/// This assumes Arg is an argument passed on the stack. This sets the frame
/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
/// I64 arg that has been split into Lo and Hi components, it calls itself
/// recursively on the components, taking care to handle Lo first because of the
/// little-endian architecture. Lastly, this function generates an instruction
/// to copy Arg into its assigned register if applicable.
template <typename TraitsType>
void TargetX86Base<TraitsType>::finishArgumentLowering(
    Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset,
    size_t StackAdjBytes, size_t &InArgsSizeBytes) {
  if (!Traits::Is64Bit) {
    if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
      Variable *Lo = Arg64On32->getLo();
      Variable *Hi = Arg64On32->getHi();
      finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
                             InArgsSizeBytes);
      return;
    }
  }
  Type Ty = Arg->getType();
  if (isVectorType(Ty)) {
    InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
  }
  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
  if (Arg->hasReg()) {
    assert(Ty != IceType_i64 || Traits::Is64Bit);
    auto *Mem = X86OperandMem::create(
        Func, Ty, FramePtr,
        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
    if (isVectorType(Arg->getType())) {
      _movp(Arg, Mem);
    } else {
      _mov(Arg, Mem);
    }
    // This argument-copying instruction uses an explicit X86OperandMem
    // operand instead of a Variable, so its fill-from-stack operation has to
    // be tracked separately for statistics.
    Ctx->statsUpdateFills();
  }
}
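
// A sketch of the 64-on-32 split handled above: on a 32-bit target, an i64
// argument at frame offset N is lowered as its Lo half at offset N and its
// Hi half at offset N+4, via the two recursive calls (Lo first, matching
// the little-endian layout).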

template <typename TraitsType>
void TargetX86Base<TraitsType>::addEpilog(CfgNode *Node) {
  InstList &Insts = Node->getInsts();
  InstList::reverse_iterator RI, E;
  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
    if (llvm::isa<typename Traits::Insts::Ret>(*RI))
      break;
  }
  if (RI == E)
    return;

  // Convert the reverse_iterator position into its corresponding (forward)
  // iterator position.
  InstList::iterator InsertPoint = reverseToForwardIterator(RI);
  --InsertPoint;
  Context.init(Node);
  Context.setInsertPoint(InsertPoint);

  if (IsEbpBasedFrame) {
    _unlink_bp();
  } else {
    // add stackptr, SpillAreaSizeBytes
    if (SpillAreaSizeBytes != 0) {
      _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
    }
  }

  // Add pop instructions for preserved registers.
  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  SmallBitVector Popped(CalleeSaves.size());
  for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
    const auto RegNum = RegNumT::fromInt(i);
    if (RegNum == getFrameReg() && IsEbpBasedFrame)
      continue;
    const RegNumT Canonical = Traits::getBaseReg(RegNum);
    if (CalleeSaves[i] && RegsUsed[i]) {
      Popped[Canonical] = true;
    }
  }
  for (int32_t i = Popped.size() - 1; i >= 0; --i) {
    if (!Popped[i])
      continue;
    const auto RegNum = RegNumT::fromInt(i);
    assert(RegNum == Traits::getBaseReg(RegNum));
    _pop_reg(RegNum);
  }

  if (!NeedSandboxing) {
    return;
  }
  emitSandboxedReturn();
  if (RI->getSrcSize()) {
    auto *RetValue = llvm::cast<Variable>(RI->getSrc(0));
    Context.insert<InstFakeUse>(RetValue);
  }
  RI->setDeleted();
}

template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
  return Traits::WordType;
}

template <typename TraitsType>
template <typename T>
typename std::enable_if<!T::Is64Bit, Operand>::type *
TargetX86Base<TraitsType>::loOperand(Operand *Operand) {
  assert(Operand->getType() == IceType_i64 ||
         Operand->getType() == IceType_f64);
  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
    return Var64On32->getLo();
  if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
    auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
        Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
    // Check if we need to blind/pool the constant.
    return legalize(ConstInt);
  }
  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
    auto *MemOperand = X86OperandMem::create(
        Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
1450         // Test whether the offset should be randomized or pooled; if so,
1451         // blind/pool it and create a mem operand with the blinded/pooled
1452         // constant. Otherwise, return the mem operand unchanged.
1453     return legalize(MemOperand);
1454   }
1455   llvm_unreachable("Unsupported operand type");
1456   return nullptr;
1457 }
1458 
1459 template <typename TraitsType>
1460 template <typename T>
1461 typename std::enable_if<!T::Is64Bit, Operand>::type *
1462 TargetX86Base<TraitsType>::hiOperand(Operand *Operand) {
1463   assert(Operand->getType() == IceType_i64 ||
1464          Operand->getType() == IceType_f64);
1465   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1466     return Operand;
1467   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1468     return Var64On32->getHi();
1469   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1470     auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1471         Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
1472     // Check if we need to blind/pool the constant.
1473     return legalize(ConstInt);
1474   }
1475   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1476     Constant *Offset = Mem->getOffset();
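    // On little-endian x86-32 the high 32-bit half lives 4 bytes above the
    // low half, so bias the memory operand's offset by 4.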
1477     if (Offset == nullptr) {
1478       Offset = Ctx->getConstantInt32(4);
1479     } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
1480       Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
1481     } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
1482       assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
1483       Offset =
1484           Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
1485     }
1486     auto *MemOperand = X86OperandMem::create(
1487         Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
1488         Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
1489     // Test whether the Offset is an eligible i32 constant for randomization
1490     // and pooling; blind/pool it if it is. Otherwise return an ordinary mem
1491     // operand.
1492     return legalize(MemOperand);
1493   }
1494   llvm_unreachable("Unsupported operand type");
1495   return nullptr;
1496 }
1497 
1498 template <typename TraitsType>
1499 SmallBitVector
1500 TargetX86Base<TraitsType>::getRegisterSet(RegSetMask Include,
1501                                           RegSetMask Exclude) const {
1502   return Traits::getRegisterSet(getFlags(), Include, Exclude);
1503 }
1504 
1505 template <typename TraitsType>
1506 void TargetX86Base<TraitsType>::lowerAlloca(const InstAlloca *Instr) {
1507   // Conservatively require the stack to be aligned. Some stack adjustment
1508   // operations implemented below assume that the stack is aligned before the
1509   // alloca. All the alloca code ensures that the stack alignment is preserved
1510   // after the alloca. The stack alignment restriction can be relaxed in some
1511   // cases.
1512   RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
1513                                             Traits::X86_STACK_ALIGNMENT_BYTES);
1514 
1515   // For default align=0, set it to the real value 1, to avoid any
1516   // bit-manipulation problems below.
1517   const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
1518 
1519   // LLVM enforces power of 2 alignment.
1520   assert(llvm::isPowerOf2_32(AlignmentParam));
1521   assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));
1522 
1523   const uint32_t Alignment =
1524       std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
1525   const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
1526   const bool OptM1 = Func->getOptLevel() == Opt_m1;
1527   const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
1528   const bool UseFramePointer =
1529       hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
1530 
1531   if (UseFramePointer)
1532     setHasFramePointer();
1533 
1534   Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
1535   if (OverAligned) {
1536     _and(esp, Ctx->getConstantInt32(-Alignment));
1537   }
1538 
1539   Variable *Dest = Instr->getDest();
1540   Operand *TotalSize = legalize(Instr->getSizeInBytes());
1541 
1542   if (const auto *ConstantTotalSize =
1543           llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
1544     const uint32_t Value =
1545         Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
1546     if (UseFramePointer) {
1547       _sub_sp(Ctx->getConstantInt32(Value));
1548     } else {
1549       // If we don't need a Frame Pointer, this alloca has a known offset to the
1550       // stack pointer. We don't need to adjust the stack pointer, nor assign any
1551       // value to Dest, as Dest is rematerializable.
1552       assert(Dest->isRematerializable());
1553       FixedAllocaSizeBytes += Value;
1554       Context.insert<InstFakeDef>(Dest);
1555     }
1556   } else {
1557     // Non-constant sizes need to be adjusted to the next highest multiple of
1558     // the required alignment at runtime.
1559     Variable *T = nullptr;
1560     if (Traits::Is64Bit && TotalSize->getType() != IceType_i64 &&
1561         !NeedSandboxing) {
1562       T = makeReg(IceType_i64);
1563       _movzx(T, TotalSize);
1564     } else {
1565       T = makeReg(IceType_i32);
1566       _mov(T, TotalSize);
1567     }
1568     _add(T, Ctx->getConstantInt32(Alignment - 1));
1569     _and(T, Ctx->getConstantInt32(-Alignment));
1570     _sub_sp(T);
1571   }
1572   // Add enough to the returned address to account for the out args area.
1573   uint32_t OutArgsSize = maxOutArgsSizeBytes();
1574   if (OutArgsSize > 0) {
1575     Variable *T = makeReg(Dest->getType());
1576     auto *CalculateOperand = X86OperandMem::create(
1577         Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
1578     _lea(T, CalculateOperand);
1579     _mov(Dest, T);
1580   } else {
1581     _mov(Dest, esp);
1582   }
1583 }
1584 
1585 template <typename TraitsType>
1586 void TargetX86Base<TraitsType>::lowerArguments() {
1587   const bool OptM1 = Func->getOptLevel() == Opt_m1;
1588   VarList &Args = Func->getArgs();
1589   unsigned NumXmmArgs = 0;
1590   bool XmmSlotsRemain = true;
1591   unsigned NumGprArgs = 0;
1592   bool GprSlotsRemain = true;
1593 
1594   Context.init(Func->getEntryNode());
1595   Context.setInsertPoint(Context.getCur());
1596 
1597   for (SizeT i = 0, End = Args.size();
1598        i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
1599     Variable *Arg = Args[i];
1600     Type Ty = Arg->getType();
1601     Variable *RegisterArg = nullptr;
1602     RegNumT RegNum;
1603     if (isVectorType(Ty)) {
1604       RegNum =
1605           Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
1606       if (RegNum.hasNoValue()) {
1607         XmmSlotsRemain = false;
1608         continue;
1609       }
1610       ++NumXmmArgs;
1611       RegisterArg = Func->makeVariable(Ty);
1612     } else if (isScalarFloatingType(Ty)) {
1613       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
1614         continue;
1615       }
1616       RegNum =
1617           Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs));
1618       if (RegNum.hasNoValue()) {
1619         XmmSlotsRemain = false;
1620         continue;
1621       }
1622       ++NumXmmArgs;
1623       RegisterArg = Func->makeVariable(Ty);
1624     } else if (isScalarIntegerType(Ty)) {
1625       RegNum = Traits::getRegisterForGprArgNum(
1626           Ty, Traits::getArgIndex(i, NumGprArgs));
1627       if (RegNum.hasNoValue()) {
1628         GprSlotsRemain = false;
1629         continue;
1630       }
1631       ++NumGprArgs;
1632       RegisterArg = Func->makeVariable(Ty);
1633     }
1634     assert(RegNum.hasValue());
1635     assert(RegisterArg != nullptr);
1636     // Replace Arg in the argument list with the home register. Then generate
1637     // an instruction in the prolog to copy the home register to the assigned
1638     // location of Arg.
1639     if (BuildDefs::dump())
1640       RegisterArg->setName(Func, "home_reg:" + Arg->getName());
1641     RegisterArg->setRegNum(RegNum);
1642     RegisterArg->setIsArg();
1643     Arg->setIsArg(false);
1644 
1645     Args[i] = RegisterArg;
1646     // When not Om1, do the assignment through a temporary, instead of directly
1647     // from the pre-colored variable, so that a subsequent availabilityGet()
1648     // call has a chance to work.  (In Om1, don't bother creating extra
1649     // instructions with extra variables to register-allocate.)
1650     if (OptM1) {
1651       Context.insert<InstAssign>(Arg, RegisterArg);
1652     } else {
1653       Variable *Tmp = makeReg(RegisterArg->getType());
1654       Context.insert<InstAssign>(Tmp, RegisterArg);
1655       Context.insert<InstAssign>(Arg, Tmp);
1656     }
1657   }
1658   if (!OptM1)
1659     Context.availabilityUpdate();
1660 }
1661 
1662 /// Strength-reduce scalar integer multiplication by a constant (for i32 or
1663 /// narrower) for certain constants. The lea instruction can be used to multiply
1664 /// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
1665 /// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
1666 /// lea-based multiplies by 5, combined with left-shifting by 2.
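///
/// For example, multiplying T by 100 (= 5 * 5 * 4) could be emitted as:
///   lea T, [T + 4*T]  ; T *= 5
///   lea T, [T + 4*T]  ; T *= 5
///   shl T, 2          ; T *= 4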
1667 template <typename TraitsType>
1668 bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
1669                                                   int32_t Src1) {
1670   // Disable this optimization for Om1 and O0, just to keep things simple
1671   // there.
1672   if (Func->getOptLevel() < Opt_1)
1673     return false;
1674   Type Ty = Dest->getType();
1675   if (Src1 == -1) {
1676     Variable *T = nullptr;
1677     _mov(T, Src0);
1678     _neg(T);
1679     _mov(Dest, T);
1680     return true;
1681   }
1682   if (Src1 == 0) {
1683     _mov(Dest, Ctx->getConstantZero(Ty));
1684     return true;
1685   }
1686   if (Src1 == 1) {
1687     Variable *T = nullptr;
1688     _mov(T, Src0);
1689     _mov(Dest, T);
1690     return true;
1691   }
1692   // Don't bother with the edge case where Src1 == MININT.
1693   if (Src1 == -Src1)
1694     return false;
1695   const bool Src1IsNegative = Src1 < 0;
1696   if (Src1IsNegative)
1697     Src1 = -Src1;
1698   uint32_t Count9 = 0;
1699   uint32_t Count5 = 0;
1700   uint32_t Count3 = 0;
1701   uint32_t Count2 = 0;
1702   uint32_t CountOps = 0;
1703   while (Src1 > 1) {
1704     if (Src1 % 9 == 0) {
1705       ++CountOps;
1706       ++Count9;
1707       Src1 /= 9;
1708     } else if (Src1 % 5 == 0) {
1709       ++CountOps;
1710       ++Count5;
1711       Src1 /= 5;
1712     } else if (Src1 % 3 == 0) {
1713       ++CountOps;
1714       ++Count3;
1715       Src1 /= 3;
1716     } else if (Src1 % 2 == 0) {
1717       if (Count2 == 0)
1718         ++CountOps;
1719       ++Count2;
1720       Src1 /= 2;
1721     } else {
1722       return false;
1723     }
1724   }
1725   // The lea optimization only works for i32 (and i64 on 64-bit targets), not i8/i16.
1726   if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
1727       (Count3 || Count5 || Count9))
1728     return false;
1729   // Limit the number of lea/shl operations for a single multiply, to a
1730   // somewhat arbitrary choice of 3.
1731   constexpr uint32_t MaxOpsForOptimizedMul = 3;
1732   if (CountOps > MaxOpsForOptimizedMul)
1733     return false;
1734   Variable *T = makeReg(Traits::WordType);
1735   if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
1736     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
1737     _movzx(T, Src0RM);
1738   } else {
1739     _mov(T, Src0);
1740   }
1741   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1742   for (uint32_t i = 0; i < Count9; ++i) {
1743     constexpr uint16_t Shift = 3; // log2(9-1)
1744     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1745   }
1746   for (uint32_t i = 0; i < Count5; ++i) {
1747     constexpr uint16_t Shift = 2; // log2(5-1)
1748     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1749   }
1750   for (uint32_t i = 0; i < Count3; ++i) {
1751     constexpr uint16_t Shift = 1; // log2(3-1)
1752     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1753   }
1754   if (Count2) {
1755     _shl(T, Ctx->getConstantInt(Ty, Count2));
1756   }
1757   if (Src1IsNegative)
1758     _neg(T);
1759   _mov(Dest, T);
1760   return true;
1761 }
1762 
1763 template <typename TraitsType>
1764 void TargetX86Base<TraitsType>::lowerShift64(InstArithmetic::OpKind Op,
1765                                              Operand *Src0Lo, Operand *Src0Hi,
1766                                              Operand *Src1Lo, Variable *DestLo,
1767                                              Variable *DestHi) {
1768   // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
1769   Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1770   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1771   Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1772   if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
1773     uint32_t ShiftAmount = ConstantShiftAmount->getValue();
1774     if (ShiftAmount > 32) {
1775       Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
1776       switch (Op) {
1777       default:
1778         assert(0 && "non-shift op");
1779         break;
1780       case InstArithmetic::Shl: {
1781         // a=b<<c ==>
1782         //   t2 = b.lo
1783         //   t2 = shl t2, ShiftAmount-32
1784         //   a.hi = t2
1785         //   a.lo = 0
1786         _mov(T_2, Src0Lo);
1787         _shl(T_2, ReducedShift);
1788         _mov(DestHi, T_2);
1789         _mov(DestLo, Zero);
1790       } break;
1791       case InstArithmetic::Lshr: {
1792         // a=b>>c (unsigned) ==>
1793         //   t2 = b.hi
1794         //   t2 = shr t2, ShiftAmount-32
1795         //   a.lo = t2
1796         //   a.hi = 0
1797         _mov(T_2, Src0Hi);
1798         _shr(T_2, ReducedShift);
1799         _mov(DestLo, T_2);
1800         _mov(DestHi, Zero);
1801       } break;
1802       case InstArithmetic::Ashr: {
1803         // a=b>>c (signed) ==>
1804         //   t3 = b.hi
1805         //   t3 = sar t3, 0x1f
1806         //   t2 = b.hi
1807         //   t2 = shrd t2, t3, ShiftAmount-32
1808         //   a.lo = t2
1809         //   a.hi = t3
1810         _mov(T_3, Src0Hi);
1811         _sar(T_3, SignExtend);
1812         _mov(T_2, Src0Hi);
1813         _shrd(T_2, T_3, ReducedShift);
1814         _mov(DestLo, T_2);
1815         _mov(DestHi, T_3);
1816       } break;
1817       }
1818     } else if (ShiftAmount == 32) {
1819       switch (Op) {
1820       default:
1821         assert(0 && "non-shift op");
1822         break;
1823       case InstArithmetic::Shl: {
1824         // a=b<<c ==>
1825         //   t2 = b.lo
1826         //   a.hi = t2
1827         //   a.lo = 0
1828         _mov(T_2, Src0Lo);
1829         _mov(DestHi, T_2);
1830         _mov(DestLo, Zero);
1831       } break;
1832       case InstArithmetic::Lshr: {
1833         // a=b>>c (unsigned) ==>
1834         //   t2 = b.hi
1835         //   a.lo = t2
1836         //   a.hi = 0
1837         _mov(T_2, Src0Hi);
1838         _mov(DestLo, T_2);
1839         _mov(DestHi, Zero);
1840       } break;
1841       case InstArithmetic::Ashr: {
1842         // a=b>>c (signed) ==>
1843         //   t2 = b.hi
1844         //   a.lo = t2
1845         //   t3 = b.hi
1846         //   t3 = sar t3, 0x1f
1847         //   a.hi = t3
1848         _mov(T_2, Src0Hi);
1849         _mov(DestLo, T_2);
1850         _mov(T_3, Src0Hi);
1851         _sar(T_3, SignExtend);
1852         _mov(DestHi, T_3);
1853       } break;
1854       }
1855     } else {
1856       // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1857       //   t2 = b.lo
1858       //   t3 = b.hi
1859       _mov(T_2, Src0Lo);
1860       _mov(T_3, Src0Hi);
1861       switch (Op) {
1862       default:
1863         assert(0 && "non-shift op");
1864         break;
1865       case InstArithmetic::Shl: {
1866         // a=b<<c ==>
1867         //   t3 = shld t3, t2, ShiftAmount
1868         //   t2 = shl t2, ShiftAmount
1869         _shld(T_3, T_2, ConstantShiftAmount);
1870         _shl(T_2, ConstantShiftAmount);
1871       } break;
1872       case InstArithmetic::Lshr: {
1873         // a=b>>c (unsigned) ==>
1874         //   t2 = shrd t2, t3, ShiftAmount
1875         //   t3 = shr t3, ShiftAmount
1876         _shrd(T_2, T_3, ConstantShiftAmount);
1877         _shr(T_3, ConstantShiftAmount);
1878       } break;
1879       case InstArithmetic::Ashr: {
1880         // a=b>>c (signed) ==>
1881         //   t2 = shrd t2, t3, ShiftAmount
1882         //   t3 = sar t3, ShiftAmount
1883         _shrd(T_2, T_3, ConstantShiftAmount);
1884         _sar(T_3, ConstantShiftAmount);
1885       } break;
1886       }
1887       // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1888       //   a.lo = t2
1889       //   a.hi = t3
1890       _mov(DestLo, T_2);
1891       _mov(DestHi, T_3);
1892     }
1893   } else {
1894     // NON-CONSTANT CASES.
1895     Constant *BitTest = Ctx->getConstantInt32(0x20);
1896     InstX86Label *Label = InstX86Label::create(Func, this);
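    // x86 masks a 32-bit shift count to its low 5 bits, so the shift/shld
    // sequences below are only correct on their own for counts in [0, 31];
    // the "test t1, 0x20" + branch patches up counts in [32, 63].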
1897     // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1898     //   t1:ecx = c.lo & 0xff
1899     //   t2 = b.lo
1900     //   t3 = b.hi
1901     T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
1902     _mov(T_2, Src0Lo);
1903     _mov(T_3, Src0Hi);
1904     switch (Op) {
1905     default:
1906       assert(0 && "non-shift op");
1907       break;
1908     case InstArithmetic::Shl: {
1909       // a=b<<c ==>
1910       //   t3 = shld t3, t2, t1
1911       //   t2 = shl t2, t1
1912       //   test t1, 0x20
1913       //   je L1
1914       //   use(t3)
1915       //   t3 = t2
1916       //   t2 = 0
1917       _shld(T_3, T_2, T_1);
1918       _shl(T_2, T_1);
1919       _test(T_1, BitTest);
1920       _br(Traits::Cond::Br_e, Label);
1921       // T_2 and T_3 are being assigned again because of the intra-block control
1922       // flow, so we need to use _redefined to avoid liveness problems.
1923       _redefined(_mov(T_3, T_2));
1924       _redefined(_mov(T_2, Zero));
1925     } break;
1926     case InstArithmetic::Lshr: {
1927       // a=b>>c (unsigned) ==>
1928       //   t2 = shrd t2, t3, t1
1929       //   t3 = shr t3, t1
1930       //   test t1, 0x20
1931       //   je L1
1932       //   use(t2)
1933       //   t2 = t3
1934       //   t3 = 0
1935       _shrd(T_2, T_3, T_1);
1936       _shr(T_3, T_1);
1937       _test(T_1, BitTest);
1938       _br(Traits::Cond::Br_e, Label);
1939       // T_2 and T_3 are being assigned again because of the intra-block control
1940       // flow, so we need to use _redefined to avoid liveness problems.
1941       _redefined(_mov(T_2, T_3));
1942       _redefined(_mov(T_3, Zero));
1943     } break;
1944     case InstArithmetic::Ashr: {
1945       // a=b>>c (signed) ==>
1946       //   t2 = shrd t2, t3, t1
1947       //   t3 = sar t3, t1
1948       //   test t1, 0x20
1949       //   je L1
1950       //   use(t2)
1951       //   t2 = t3
1952       //   t3 = sar t3, 0x1f
1954       _shrd(T_2, T_3, T_1);
1955       _sar(T_3, T_1);
1956       _test(T_1, BitTest);
1957       _br(Traits::Cond::Br_e, Label);
1958       // T_2 and T_3 are being assigned again because of the intra-block control
1959       // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
1960       // doesn't need special treatment because it is reassigned via _sar
1961       // instead of _mov.
1962       _redefined(_mov(T_2, T_3));
1963       _sar(T_3, SignExtend);
1964     } break;
1965     }
1966     // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1967     // L1:
1968     //   a.lo = t2
1969     //   a.hi = t3
1970     Context.insert(Label);
1971     _mov(DestLo, T_2);
1972     _mov(DestHi, T_3);
1973   }
1974 }
1975 
1976 template <typename TraitsType>
1977 void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Instr) {
1978   Variable *Dest = Instr->getDest();
1979   if (Dest->isRematerializable()) {
1980     Context.insert<InstFakeDef>(Dest);
1981     return;
1982   }
1983   Type Ty = Dest->getType();
1984   Operand *Src0 = legalize(Instr->getSrc(0));
1985   Operand *Src1 = legalize(Instr->getSrc(1));
1986   if (Instr->isCommutative()) {
1987     uint32_t SwapCount = 0;
1988     if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
1989       std::swap(Src0, Src1);
1990       ++SwapCount;
1991     }
1992     if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
1993       std::swap(Src0, Src1);
1994       ++SwapCount;
1995     }
1996     // Improve two-address code patterns by avoiding a copy to the dest
1997     // register when one of the source operands ends its lifetime here.
1998     if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
1999       std::swap(Src0, Src1);
2000       ++SwapCount;
2001     }
2002     assert(SwapCount <= 1);
2003     (void)SwapCount;
2004   }
2005   if (!Traits::Is64Bit && Ty == IceType_i64) {
2006     // These x86-32 helper-call-involved instructions are lowered in this
2007     // separate switch because loOperand() and hiOperand() may insert
2008     // redundant instructions for constant blinding and pooling, and such
2009     // redundant instructions fail liveness analysis under the -Om1 setting.
2010     // Moreover, these arguments do not need to be processed with loOperand()
2011     // and hiOperand() in order to be used.
2012     switch (Instr->getOp()) {
2013     case InstArithmetic::Udiv:
2014     case InstArithmetic::Sdiv:
2015     case InstArithmetic::Urem:
2016     case InstArithmetic::Srem:
2017       llvm::report_fatal_error("Helper call was expected");
2018       return;
2019     default:
2020       break;
2021     }
2022 
2023     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2024     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2025     Operand *Src0Lo = loOperand(Src0);
2026     Operand *Src0Hi = hiOperand(Src0);
2027     Operand *Src1Lo = loOperand(Src1);
2028     Operand *Src1Hi = hiOperand(Src1);
2029     Variable *T_Lo = nullptr, *T_Hi = nullptr;
2030     switch (Instr->getOp()) {
2031     case InstArithmetic::_num:
2032       llvm_unreachable("Unknown arithmetic operator");
2033       break;
2034     case InstArithmetic::Add:
2035       _mov(T_Lo, Src0Lo);
2036       _add(T_Lo, Src1Lo);
2037       _mov(DestLo, T_Lo);
2038       _mov(T_Hi, Src0Hi);
2039       _adc(T_Hi, Src1Hi);
2040       _mov(DestHi, T_Hi);
2041       break;
2042     case InstArithmetic::And:
2043       _mov(T_Lo, Src0Lo);
2044       _and(T_Lo, Src1Lo);
2045       _mov(DestLo, T_Lo);
2046       _mov(T_Hi, Src0Hi);
2047       _and(T_Hi, Src1Hi);
2048       _mov(DestHi, T_Hi);
2049       break;
2050     case InstArithmetic::Or:
2051       _mov(T_Lo, Src0Lo);
2052       _or(T_Lo, Src1Lo);
2053       _mov(DestLo, T_Lo);
2054       _mov(T_Hi, Src0Hi);
2055       _or(T_Hi, Src1Hi);
2056       _mov(DestHi, T_Hi);
2057       break;
2058     case InstArithmetic::Xor:
2059       _mov(T_Lo, Src0Lo);
2060       _xor(T_Lo, Src1Lo);
2061       _mov(DestLo, T_Lo);
2062       _mov(T_Hi, Src0Hi);
2063       _xor(T_Hi, Src1Hi);
2064       _mov(DestHi, T_Hi);
2065       break;
2066     case InstArithmetic::Sub:
2067       _mov(T_Lo, Src0Lo);
2068       _sub(T_Lo, Src1Lo);
2069       _mov(DestLo, T_Lo);
2070       _mov(T_Hi, Src0Hi);
2071       _sbb(T_Hi, Src1Hi);
2072       _mov(DestHi, T_Hi);
2073       break;
2074     case InstArithmetic::Mul: {
2075       Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
2076       Variable *T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
2077       Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
2078       // gcc does the following:
2079       // a=b*c ==>
2080       //   t1 = b.hi; t1 *=(imul) c.lo
2081       //   t2 = c.hi; t2 *=(imul) b.lo
2082       //   t3:eax = b.lo
2083       //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
2084       //   a.lo = t4.lo
2085       //   t4.hi += t1
2086       //   t4.hi += t2
2087       //   a.hi = t4.hi
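      // This is schoolbook multiplication mod 2^64:
      //   (b.hi*2^32 + b.lo) * (c.hi*2^32 + c.lo)
      //     == b.lo*c.lo + 2^32*(b.hi*c.lo + b.lo*c.hi)  (mod 2^64),
      // so the two imuls supply the cross terms that get added into t4.hi.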
2088       // The mul instruction cannot take an immediate operand.
2089       Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
2090       _mov(T_1, Src0Hi);
2091       _imul(T_1, Src1Lo);
2092       _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
2093       _mul(T_4Lo, T_3, Src1Lo);
2094       // The mul instruction produces two dest variables, edx:eax. We create a
2095       // fake definition of edx to account for this.
2096       Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
2097       Context.insert<InstFakeUse>(T_4Hi);
2098       _mov(DestLo, T_4Lo);
2099       _add(T_4Hi, T_1);
2100       _mov(T_2, Src1Hi);
2101       _imul(T_2, Src0Lo);
2102       _add(T_4Hi, T_2);
2103       _mov(DestHi, T_4Hi);
2104     } break;
2105     case InstArithmetic::Shl:
2106     case InstArithmetic::Lshr:
2107     case InstArithmetic::Ashr:
2108       lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
2109       break;
2110     case InstArithmetic::Fadd:
2111     case InstArithmetic::Fsub:
2112     case InstArithmetic::Fmul:
2113     case InstArithmetic::Fdiv:
2114     case InstArithmetic::Frem:
2115       llvm_unreachable("FP instruction with i64 type");
2116       break;
2117     case InstArithmetic::Udiv:
2118     case InstArithmetic::Sdiv:
2119     case InstArithmetic::Urem:
2120     case InstArithmetic::Srem:
2121       llvm_unreachable("Call-helper-involved instruction for i64 type "
2122                        "should have already been handled before");
2123       break;
2124     }
2125     return;
2126   }
2127   if (isVectorType(Ty)) {
2128     // TODO: Trap on integer divide and integer modulo by zero. See:
2129     // https://code.google.com/p/nativeclient/issues/detail?id=3899
2130     if (llvm::isa<X86OperandMem>(Src1))
2131       Src1 = legalizeToReg(Src1);
2132     switch (Instr->getOp()) {
2133     case InstArithmetic::_num:
2134       llvm_unreachable("Unknown arithmetic operator");
2135       break;
2136     case InstArithmetic::Add: {
2137       Variable *T = makeReg(Ty);
2138       _movp(T, Src0);
2139       _padd(T, Src1);
2140       _movp(Dest, T);
2141     } break;
2142     case InstArithmetic::And: {
2143       Variable *T = makeReg(Ty);
2144       _movp(T, Src0);
2145       _pand(T, Src1);
2146       _movp(Dest, T);
2147     } break;
2148     case InstArithmetic::Or: {
2149       Variable *T = makeReg(Ty);
2150       _movp(T, Src0);
2151       _por(T, Src1);
2152       _movp(Dest, T);
2153     } break;
2154     case InstArithmetic::Xor: {
2155       Variable *T = makeReg(Ty);
2156       _movp(T, Src0);
2157       _pxor(T, Src1);
2158       _movp(Dest, T);
2159     } break;
2160     case InstArithmetic::Sub: {
2161       Variable *T = makeReg(Ty);
2162       _movp(T, Src0);
2163       _psub(T, Src1);
2164       _movp(Dest, T);
2165     } break;
2166     case InstArithmetic::Mul: {
2167       bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
2168       bool InstructionSetIsValidForPmull =
2169           Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
2170       if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
2171         Variable *T = makeReg(Ty);
2172         _movp(T, Src0);
2173         _pmull(T, Src0 == Src1 ? T : Src1);
2174         _movp(Dest, T);
2175       } else if (Ty == IceType_v4i32) {
2176         // Lowering sequence:
2177         // Note: The mask arguments have index 0 on the left.
2178         //
2179         // movups  T1, Src0
2180         // pshufd  T2, Src0, {1,0,3,0}
2181         // pshufd  T3, Src1, {1,0,3,0}
2182         // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
2183         // pmuludq T1, Src1
2184         // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
2185         // pmuludq T2, T3
2186         // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
2187         // shufps  T1, T2, {0,2,0,2}
2188         // pshufd  T4, T1, {0,2,1,3}
2189         // movups  Dest, T4
2190 
2191         // Mask that directs pshufd to create a vector with entries
2192         // Src[1, 0, 3, 0]
2193         constexpr unsigned Constant1030 = 0x31;
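        // (0x31 == 0b00110001: reading two-bit fields from the LSB up selects
        // elements 1, 0, 3, 0.)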
2194         Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
2195         // Mask that directs shufps to create a vector with entries
2196         // Dest[0, 2], Src[0, 2]
2197         constexpr unsigned Mask0202 = 0x88;
2198         // Mask that directs pshufd to create a vector with entries
2199         // Src[0, 2, 1, 3]
2200         constexpr unsigned Mask0213 = 0xd8;
2201         Variable *T1 = makeReg(IceType_v4i32);
2202         Variable *T2 = makeReg(IceType_v4i32);
2203         Variable *T3 = makeReg(IceType_v4i32);
2204         Variable *T4 = makeReg(IceType_v4i32);
2205         _movp(T1, Src0);
2206         _pshufd(T2, Src0, Mask1030);
2207         _pshufd(T3, Src1, Mask1030);
2208         _pmuludq(T1, Src1);
2209         _pmuludq(T2, T3);
2210         _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
2211         _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
2212         _movp(Dest, T4);
2213       } else if (Ty == IceType_v16i8) {
2214         llvm::report_fatal_error("Scalarized operation was expected");
2215       } else {
2216         llvm::report_fatal_error("Invalid vector multiply type");
2217       }
2218     } break;
2219     case InstArithmetic::Shl: {
2220       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
2221       Variable *T = makeReg(Ty);
2222       _movp(T, Src0);
2223       _psll(T, Src1);
2224       _movp(Dest, T);
2225     } break;
2226     case InstArithmetic::Lshr: {
2227       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
2228       Variable *T = makeReg(Ty);
2229       _movp(T, Src0);
2230       _psrl(T, Src1);
2231       _movp(Dest, T);
2232     } break;
2233     case InstArithmetic::Ashr: {
2234       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
2235       Variable *T = makeReg(Ty);
2236       _movp(T, Src0);
2237       _psra(T, Src1);
2238       _movp(Dest, T);
2239     } break;
2240     case InstArithmetic::Udiv:
2241     case InstArithmetic::Urem:
2242     case InstArithmetic::Sdiv:
2243     case InstArithmetic::Srem:
2244       llvm::report_fatal_error("Scalarized operation was expected");
2245       break;
2246     case InstArithmetic::Fadd: {
2247       Variable *T = makeReg(Ty);
2248       _movp(T, Src0);
2249       _addps(T, Src1);
2250       _movp(Dest, T);
2251     } break;
2252     case InstArithmetic::Fsub: {
2253       Variable *T = makeReg(Ty);
2254       _movp(T, Src0);
2255       _subps(T, Src1);
2256       _movp(Dest, T);
2257     } break;
2258     case InstArithmetic::Fmul: {
2259       Variable *T = makeReg(Ty);
2260       _movp(T, Src0);
2261       _mulps(T, Src0 == Src1 ? T : Src1);
2262       _movp(Dest, T);
2263     } break;
2264     case InstArithmetic::Fdiv: {
2265       Variable *T = makeReg(Ty);
2266       _movp(T, Src0);
2267       _divps(T, Src1);
2268       _movp(Dest, T);
2269     } break;
2270     case InstArithmetic::Frem:
2271       llvm::report_fatal_error("Scalarized operation was expected");
2272       break;
2273     }
2274     return;
2275   }
2276   Variable *T_edx = nullptr;
2277   Variable *T = nullptr;
2278   switch (Instr->getOp()) {
2279   case InstArithmetic::_num:
2280     llvm_unreachable("Unknown arithmetic operator");
2281     break;
2282   case InstArithmetic::Add: {
2283     const bool ValidType =
2284         Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit);
2285     auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
2286     const bool ValidKind =
2287         Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
2288                              llvm::isa<ConstantRelocatable>(Const));
2289     if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
2290       auto *Var = legalizeToReg(Src0);
2291       auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const);
2292       T = makeReg(Ty);
2293       _lea(T, _sandbox_mem_reference(Mem));
2294       _mov(Dest, T);
2295       break;
2296     }
2297     _mov(T, Src0);
2298     _add(T, Src1);
2299     _mov(Dest, T);
2300   } break;
2301   case InstArithmetic::And:
2302     _mov(T, Src0);
2303     _and(T, Src1);
2304     _mov(Dest, T);
2305     break;
2306   case InstArithmetic::Or:
2307     _mov(T, Src0);
2308     _or(T, Src1);
2309     _mov(Dest, T);
2310     break;
2311   case InstArithmetic::Xor:
2312     _mov(T, Src0);
2313     _xor(T, Src1);
2314     _mov(Dest, T);
2315     break;
2316   case InstArithmetic::Sub:
2317     _mov(T, Src0);
2318     _sub(T, Src1);
2319     _mov(Dest, T);
2320     break;
2321   case InstArithmetic::Mul:
2322     if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2323       if (optimizeScalarMul(Dest, Src0, C->getValue()))
2324         return;
2325     }
2326     // The 8-bit version of imul only allows the form "imul r/m8" where T must
2327     // be in al.
2328     if (isByteSizedArithType(Ty)) {
2329       _mov(T, Src0, Traits::RegisterSet::Reg_al);
2330       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2331       _imul(T, Src0 == Src1 ? T : Src1);
2332       _mov(Dest, T);
2333     } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2334       T = makeReg(Ty);
2335       _imul_imm(T, Src0, ImmConst);
2336       _mov(Dest, T);
2337     } else {
2338       _mov(T, Src0);
2339       _imul(T, Src0 == Src1 ? T : Src1);
2340       _mov(Dest, T);
2341     }
2342     break;
2343   case InstArithmetic::Shl:
2344     _mov(T, Src0);
2345     if (!llvm::isa<ConstantInteger32>(Src1) &&
2346         !llvm::isa<ConstantInteger64>(Src1))
2347       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2348     _shl(T, Src1);
2349     _mov(Dest, T);
2350     break;
2351   case InstArithmetic::Lshr:
2352     _mov(T, Src0);
2353     if (!llvm::isa<ConstantInteger32>(Src1) &&
2354         !llvm::isa<ConstantInteger64>(Src1))
2355       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2356     _shr(T, Src1);
2357     _mov(Dest, T);
2358     break;
2359   case InstArithmetic::Ashr:
2360     _mov(T, Src0);
2361     if (!llvm::isa<ConstantInteger32>(Src1) &&
2362         !llvm::isa<ConstantInteger64>(Src1))
2363       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2364     _sar(T, Src1);
2365     _mov(Dest, T);
2366     break;
2367   case InstArithmetic::Udiv: {
2368     // div and idiv are among the few arithmetic operators that do not allow
2369     // an immediate operand.
2370     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2371     RegNumT Eax;
2372     RegNumT Edx;
2373     switch (Ty) {
2374     default:
2375       llvm::report_fatal_error("Bad type for udiv");
2376     case IceType_i64:
2377       Eax = Traits::getRaxOrDie();
2378       Edx = Traits::getRdxOrDie();
2379       break;
2380     case IceType_i32:
2381       Eax = Traits::RegisterSet::Reg_eax;
2382       Edx = Traits::RegisterSet::Reg_edx;
2383       break;
2384     case IceType_i16:
2385       Eax = Traits::RegisterSet::Reg_ax;
2386       Edx = Traits::RegisterSet::Reg_dx;
2387       break;
2388     case IceType_i8:
2389       Eax = Traits::RegisterSet::Reg_al;
2390       Edx = Traits::RegisterSet::Reg_ah;
2391       break;
2392     }
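    // Zero the high half of the dividend (edx/dx/ah depending on the operand
    // width) before the unsigned divide.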
2393     T_edx = makeReg(Ty, Edx);
2394     _mov(T, Src0, Eax);
2395     _mov(T_edx, Ctx->getConstantZero(Ty));
2396     _div(T_edx, Src1, T);
2397     _redefined(Context.insert<InstFakeDef>(T, T_edx));
2398     _mov(Dest, T);
2399   } break;
2400   case InstArithmetic::Sdiv:
2401     // TODO(stichnot): Enable this after doing better performance and cross
2402     // testing.
2403     if (false && Func->getOptLevel() >= Opt_1) {
2404       // Optimize division by constant power of 2, but not for Om1 or O0, just
2405       // to keep things simple there.
2406       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2407         const int32_t Divisor = C->getValue();
2408         const uint32_t UDivisor = Divisor;
2409         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2410           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2411           // LLVM does the following for dest=src/(1<<log):
2412           //   t=src
2413           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2414           //   shr t,typewidth-log
2415           //   add t,src
2416           //   sar t,log
2417           //   dest=t
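          // For example, with Ty == i32 and LogDiv == 3 (dividing by 8):
          //   sar t,31  // t = -1 if src < 0, else 0
          //   shr t,29  // t = 7 if src < 0, else 0 (the rounding bias)
          //   add t,src
          //   sar t,3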
2418           uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
2419           _mov(T, Src0);
2420           // If for some reason we are dividing by 1, just treat it like an
2421           // assignment.
2422           if (LogDiv > 0) {
2423             // The initial sar is unnecessary when dividing by 2.
2424             if (LogDiv > 1)
2425               _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2426             _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2427             _add(T, Src0);
2428             _sar(T, Ctx->getConstantInt(Ty, LogDiv));
2429           }
2430           _mov(Dest, T);
2431           return;
2432         }
2433       }
2434     }
2435     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2436     switch (Ty) {
2437     default:
2438       llvm::report_fatal_error("Bad type for sdiv");
2439     case IceType_i64:
2440       T_edx = makeReg(Ty, Traits::getRdxOrDie());
2441       _mov(T, Src0, Traits::getRaxOrDie());
2442       break;
2443     case IceType_i32:
2444       T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
2445       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
2446       break;
2447     case IceType_i16:
2448       T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
2449       _mov(T, Src0, Traits::RegisterSet::Reg_ax);
2450       break;
2451     case IceType_i8:
2452       T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
2453       _mov(T, Src0, Traits::RegisterSet::Reg_al);
2454       break;
2455     }
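    // _cbwdq sign-extends the dividend into the high register (cbw, cwd, cdq,
    // or cqo, depending on the operand width) so idiv sees the full-width
    // dividend.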
2456     _cbwdq(T_edx, T);
2457     _idiv(T_edx, Src1, T);
2458     _redefined(Context.insert<InstFakeDef>(T, T_edx));
2459     _mov(Dest, T);
2460     break;
2461   case InstArithmetic::Urem: {
2462     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2463     RegNumT Eax;
2464     RegNumT Edx;
2465     switch (Ty) {
2466     default:
2467       llvm::report_fatal_error("Bad type for urem");
2468     case IceType_i64:
2469       Eax = Traits::getRaxOrDie();
2470       Edx = Traits::getRdxOrDie();
2471       break;
2472     case IceType_i32:
2473       Eax = Traits::RegisterSet::Reg_eax;
2474       Edx = Traits::RegisterSet::Reg_edx;
2475       break;
2476     case IceType_i16:
2477       Eax = Traits::RegisterSet::Reg_ax;
2478       Edx = Traits::RegisterSet::Reg_dx;
2479       break;
2480     case IceType_i8:
2481       Eax = Traits::RegisterSet::Reg_al;
2482       Edx = Traits::RegisterSet::Reg_ah;
2483       break;
2484     }
2485     T_edx = makeReg(Ty, Edx);
2486     _mov(T_edx, Ctx->getConstantZero(Ty));
2487     _mov(T, Src0, Eax);
2488     _div(T, Src1, T_edx);
2489     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2490     if (Ty == IceType_i8) {
2491       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2492       // moved into a general 8-bit register.
2493       auto *T_AhRcvr = makeReg(Ty);
2494       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2495       _mov(T_AhRcvr, T_edx);
2496       T_edx = T_AhRcvr;
2497     }
2498     _mov(Dest, T_edx);
2499   } break;
2500   case InstArithmetic::Srem: {
2501     // TODO(stichnot): Enable this after doing better performance and cross
2502     // testing.
2503     if (false && Func->getOptLevel() >= Opt_1) {
2504       // Optimize mod by constant power of 2, but not for Om1 or O0, just to
2505       // keep things simple there.
2506       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2507         const int32_t Divisor = C->getValue();
2508         const uint32_t UDivisor = Divisor;
2509         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2510           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2511           // LLVM does the following for dest=src%(1<<log):
2512           //   t=src
2513           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2514           //   shr t,typewidth-log
2515           //   add t,src
2516           //   and t, -(1<<log)
2517           //   sub t,src
2518           //   neg t
2519           //   dest=t
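          // For example, with Ty == i32 and LogDiv == 3 (modulo 8), this
          // computes t = ((src + bias) & -8) - src and negates it, yielding
          // src - 8*(src/8), i.e. the C-style signed remainder.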
2520           uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
2521           // If for some reason we are dividing by 1, just assign 0.
2522           if (LogDiv == 0) {
2523             _mov(Dest, Ctx->getConstantZero(Ty));
2524             return;
2525           }
2526           _mov(T, Src0);
2527           // The initial sar is unnecessary when dividing by 2.
2528           if (LogDiv > 1)
2529             _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2530           _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2531           _add(T, Src0);
2532           _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
2533           _sub(T, Src0);
2534           _neg(T);
2535           _mov(Dest, T);
2536           return;
2537         }
2538       }
2539     }
2540     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2541     RegNumT Eax;
2542     RegNumT Edx;
2543     switch (Ty) {
2544     default:
2545       llvm::report_fatal_error("Bad type for srem");
2546     case IceType_i64:
2547       Eax = Traits::getRaxOrDie();
2548       Edx = Traits::getRdxOrDie();
2549       break;
2550     case IceType_i32:
2551       Eax = Traits::RegisterSet::Reg_eax;
2552       Edx = Traits::RegisterSet::Reg_edx;
2553       break;
2554     case IceType_i16:
2555       Eax = Traits::RegisterSet::Reg_ax;
2556       Edx = Traits::RegisterSet::Reg_dx;
2557       break;
2558     case IceType_i8:
2559       Eax = Traits::RegisterSet::Reg_al;
2560       Edx = Traits::RegisterSet::Reg_ah;
2561       break;
2562     }
2563     T_edx = makeReg(Ty, Edx);
2564     _mov(T, Src0, Eax);
2565     _cbwdq(T_edx, T);
2566     _idiv(T, Src1, T_edx);
2567     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2568     if (Ty == IceType_i8) {
2569       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2570       // moved into a general 8-bit register.
2571       auto *T_AhRcvr = makeReg(Ty);
2572       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2573       _mov(T_AhRcvr, T_edx);
2574       T_edx = T_AhRcvr;
2575     }
2576     _mov(Dest, T_edx);
2577   } break;
2578   case InstArithmetic::Fadd:
2579     _mov(T, Src0);
2580     _addss(T, Src1);
2581     _mov(Dest, T);
2582     break;
2583   case InstArithmetic::Fsub:
2584     _mov(T, Src0);
2585     _subss(T, Src1);
2586     _mov(Dest, T);
2587     break;
2588   case InstArithmetic::Fmul:
2589     _mov(T, Src0);
2590     _mulss(T, Src0 == Src1 ? T : Src1);
2591     _mov(Dest, T);
2592     break;
2593   case InstArithmetic::Fdiv:
2594     _mov(T, Src0);
2595     _divss(T, Src1);
2596     _mov(Dest, T);
2597     break;
2598   case InstArithmetic::Frem:
2599     llvm::report_fatal_error("Helper call was expected");
2600     break;
2601   }
2602 }
2603 
2604 template <typename TraitsType>
2605 void TargetX86Base<TraitsType>::lowerAssign(const InstAssign *Instr) {
2606   Variable *Dest = Instr->getDest();
2607   if (Dest->isRematerializable()) {
2608     Context.insert<InstFakeDef>(Dest);
2609     return;
2610   }
2611   Operand *Src = Instr->getSrc(0);
2612   assert(Dest->getType() == Src->getType());
2613   lowerMove(Dest, Src, false);
2614 }
2615 
2616 template <typename TraitsType>
2617 void TargetX86Base<TraitsType>::lowerBr(const InstBr *Br) {
2618   if (Br->isUnconditional()) {
2619     _br(Br->getTargetUnconditional());
2620     return;
2621   }
2622   Operand *Cond = Br->getCondition();
2623 
2624   // Handle folding opportunities.
2625   if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
2626     assert(Producer->isDeleted());
2627     switch (BoolFolding<Traits>::getProducerKind(Producer)) {
2628     default:
2629       break;
2630     case BoolFolding<Traits>::PK_Icmp32:
2631     case BoolFolding<Traits>::PK_Icmp64: {
2632       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
2633       return;
2634     }
2635     case BoolFolding<Traits>::PK_Fcmp: {
2636       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
2637       return;
2638     }
2639     case BoolFolding<Traits>::PK_Arith: {
2640       lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
2641       return;
2642     }
2643     }
2644   }
2645   Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
2646   Constant *Zero = Ctx->getConstantZero(IceType_i32);
2647   _cmp(Src0, Zero);
2648   _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
2649 }
2650 
2651 // constexprMax returns a (constexpr) max(S0, S1); it is used for defining
2652 // OperandList in lowerCall. std::max() is not constexpr until C++14.
2653 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
2654   return S0 < S1 ? S1 : S0;
2655 }
2656 
2657 template <typename TraitsType>
2658 void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
2659   // Common x86 calling convention lowering:
2660   //
2661   // * At the point before the call, the stack must be aligned to 16 bytes.
2662   //
2663   // * Non-register arguments are pushed onto the stack in right-to-left order,
2664   // such that the left-most argument ends up on the top of the stack at the
2665   // lowest memory address.
2666   //
2667   // * Stack arguments of vector type are aligned to start at the next highest
2668   // multiple of 16 bytes. Other stack arguments are aligned to the next word
2669   // size boundary (4 or 8 bytes, respectively).
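  //
  // For example (a sketch, assuming all three arguments end up on the stack
  // and there is no shadow store): on x86-32, f(i32 a, <4 x i32> b, i32 c)
  // would place a at [esp+0], b at [esp+16] (padded up to a 16-byte
  // boundary), and c at [esp+32].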
2670   RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment,
2671                                             Traits::X86_STACK_ALIGNMENT_BYTES);
2672 
2673   constexpr SizeT MaxOperands =
2674       constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS);
2675   using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
2676 
2677   OperandList XmmArgs;
2678   llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
2679   CfgVector<std::pair<const Type, Operand *>> GprArgs;
2680   CfgVector<SizeT> GprArgIndices;
2681   OperandList StackArgs, StackArgLocations;
2682   uint32_t ParameterAreaSizeBytes = 0;
2683 
2684   ParameterAreaSizeBytes += getShadowStoreSize<Traits>();
2685 
2686   // Classify each argument operand according to the location where the argument
2687   // is passed.
2688   for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
2689     Operand *Arg = Instr->getArg(i);
2690     const Type Ty = Arg->getType();
2691     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
2692     assert(typeWidthInBytes(Ty) >= 4);
2693     if (isVectorType(Ty) &&
2694         Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size()))
2695             .hasValue()) {
2696       XmmArgs.push_back(Arg);
2697       XmmArgIndices.push_back(i);
2698     } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
2699                Traits::getRegisterForXmmArgNum(
2700                    Traits::getArgIndex(i, XmmArgs.size()))
2701                    .hasValue()) {
2702       XmmArgs.push_back(Arg);
2703       XmmArgIndices.push_back(i);
2704     } else if (isScalarIntegerType(Ty) &&
2705                Traits::getRegisterForGprArgNum(
2706                    Ty, Traits::getArgIndex(i, GprArgs.size()))
2707                    .hasValue()) {
2708       GprArgs.emplace_back(Ty, Arg);
2709       GprArgIndices.push_back(i);
2710     } else {
2711       // Place on stack.
2712       StackArgs.push_back(Arg);
2713       if (isVectorType(Arg->getType())) {
2714         ParameterAreaSizeBytes =
2715             Traits::applyStackAlignment(ParameterAreaSizeBytes);
2716       }
2717       Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
2718       Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
2719       StackArgLocations.push_back(
2720           Traits::X86OperandMem::create(Func, Ty, esp, Loc));
2721       ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
2722     }
2723   }
2724   // Ensure there is enough space for the fstp/movs for floating returns.
2725   Variable *Dest = Instr->getDest();
2726   const Type DestTy = Dest ? Dest->getType() : IceType_void;
2727   if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2728     if (isScalarFloatingType(DestTy)) {
2729       ParameterAreaSizeBytes =
2730           std::max(static_cast<size_t>(ParameterAreaSizeBytes),
2731                    typeWidthInBytesOnStack(DestTy));
2732     }
2733   }
2734   // Adjust the parameter area so that the stack is aligned. It is assumed that
2735   // the stack is already aligned at the start of the calling sequence.
2736   ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
2737   assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
2738   // Copy arguments that are passed on the stack to the appropriate stack
2739   // locations.  We make sure legalize() is called on each argument at this
2740   // point, to allow availabilityGet() to work.
2741   for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
2742     lowerStore(
2743         InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
2744   }
2745   // Copy arguments to be passed in registers to the appropriate registers.
2746   for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
2747     XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
2748                                Traits::getRegisterForXmmArgNum(
2749                                    Traits::getArgIndex(XmmArgIndices[i], i)));
2750   }
2751   // Materialize moves for arguments passed in GPRs.
2752   for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
2753     const Type SignatureTy = GprArgs[i].first;
2754     Operand *Arg =
2755         legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
2756     GprArgs[i].second = legalizeToReg(
2757         Arg, Traits::getRegisterForGprArgNum(
2758                  Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i)));
2759     assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
2760     assert(SignatureTy == Arg->getType());
2761     (void)SignatureTy;
2762   }
2763   // Generate a FakeUse of register arguments so that they do not get dead code
2764   // eliminated as a result of the FakeKill of scratch registers after the call.
2765   // These need to be right before the call instruction.
2766   for (auto *Arg : XmmArgs) {
2767     Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
2768   }
2769   for (auto &ArgPair : GprArgs) {
2770     Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
2771   }
2772   // Generate the call instruction. Assign its result to a temporary with high
2773   // register allocation weight.
2774   // ReturnReg doubles as ReturnRegLo as necessary.
2775   Variable *ReturnReg = nullptr;
2776   Variable *ReturnRegHi = nullptr;
2777   if (Dest) {
2778     switch (DestTy) {
2779     case IceType_NUM:
2780     case IceType_void:
2781     case IceType_i1:
2782     case IceType_i8:
2783     case IceType_i16:
2784       llvm::report_fatal_error("Invalid Call dest type");
2785       break;
2786     case IceType_i32:
2787       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax);
2788       break;
2789     case IceType_i64:
2790       if (Traits::Is64Bit) {
2791         ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie());
2792       } else {
2793         ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
2794         ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
2795       }
2796       break;
2797     case IceType_f32:
2798     case IceType_f64:
2799       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2800         // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
2801         // the fstp instruction.
2802         break;
2803       }
2804     // Fallthrough intended.
2805     case IceType_v4i1:
2806     case IceType_v8i1:
2807     case IceType_v16i1:
2808     case IceType_v16i8:
2809     case IceType_v8i16:
2810     case IceType_v4i32:
2811     case IceType_v4f32:
2812       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0);
2813       break;
2814     }
2815   }
2816   // Emit the call to the function.
2817   Operand *CallTarget =
2818       legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
2819   size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
2820   Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
2821   // Keep the upper return register live on 32-bit platforms.
  if (ReturnRegHi)
    Context.insert<InstFakeDef>(ReturnRegHi);
  // Mark the call as killing all the caller-save registers.
  Context.insert<InstFakeKill>(NewCall);
  // Handle x86-32 floating point returns.
  if (Dest != nullptr && isScalarFloatingType(DestTy) &&
      !Traits::X86_PASS_SCALAR_FP_IN_XMM) {
    // Special treatment for an FP function which returns its result in st(0).
    // If Dest ends up being a physical xmm register, the fstp emit code will
    // route st(0) through the space reserved in the function argument area
    // we allocated.
    _fstp(Dest);
    // Create a fake use of Dest in case it actually isn't used, because st(0)
    // still needs to be popped.
    Context.insert<InstFakeUse>(Dest);
  }
  // Generate a FakeUse to keep the call live if necessary.
  if (Instr->hasSideEffects() && ReturnReg) {
    Context.insert<InstFakeUse>(ReturnReg);
  }
  // Process the return value, if any.
  if (Dest == nullptr)
    return;
  // Assign the result of the call to Dest.  Route it through a temporary so
  // that the local register availability peephole can be subsequently used.
  Variable *Tmp = nullptr;
  if (isVectorType(DestTy)) {
    assert(ReturnReg && "Vector type requires a return register");
    Tmp = makeReg(DestTy);
    _movp(Tmp, ReturnReg);
    _movp(Dest, Tmp);
  } else if (isScalarFloatingType(DestTy)) {
    if (Traits::X86_PASS_SCALAR_FP_IN_XMM) {
      assert(ReturnReg && "FP type requires a return register");
      _mov(Tmp, ReturnReg);
      _mov(Dest, Tmp);
    }
  } else {
    assert(isScalarIntegerType(DestTy));
    assert(ReturnReg && "Integer type requires a return register");
    if (DestTy == IceType_i64 && !Traits::Is64Bit) {
      assert(ReturnRegHi && "64-bit type requires two return registers");
      auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
      Variable *DestLo = Dest64On32->getLo();
      Variable *DestHi = Dest64On32->getHi();
      _mov(Tmp, ReturnReg);
      _mov(DestLo, Tmp);
      Variable *TmpHi = nullptr;
      _mov(TmpHi, ReturnRegHi);
      _mov(DestHi, TmpHi);
    } else {
      _mov(Tmp, ReturnReg);
      _mov(Dest, Tmp);
    }
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerCast(const InstCast *Instr) {
  // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
  InstCast::OpKind CastKind = Instr->getCastKind();
  Variable *Dest = Instr->getDest();
  Type DestTy = Dest->getType();
  switch (CastKind) {
  default:
    Func->setError("Cast type not supported");
    return;
  case InstCast::Sext: {
    // Src0RM is the source operand legalized to physical register or memory,
    // but not immediate, since the relevant x86 native instructions don't
    // allow an immediate operand. If the operand is an immediate, we could
    // consider computing the strength-reduced result at translation time, but
    // we're unlikely to see something like that in the bitcode that the
    // optimizer wouldn't have already taken care of.
    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
    if (isVectorType(DestTy)) {
      if (DestTy == IceType_v16i8) {
        // onemask = materialize(1,1,...); dst = (src & onemask) > 0
        Variable *OneMask = makeVectorOfOnes(DestTy);
        Variable *T = makeReg(DestTy);
        _movp(T, Src0RM);
        _pand(T, OneMask);
        Variable *Zeros = makeVectorOfZeros(DestTy);
        _pcmpgt(T, Zeros);
        _movp(Dest, T);
      } else {
        // width = width(elty) - 1; dest = (src << width) >> width
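        // e.g., for v8i16 the shift amount is 15, so the pair below becomes
        // (sketch) psllw $15 then psraw $15, replicating each element's low
        // bit across all 16 bits.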
        SizeT ShiftAmount =
            Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
            1;
        Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
        Variable *T = makeReg(DestTy);
        _movp(T, Src0RM);
        _psll(T, ShiftConstant);
        _psra(T, ShiftConstant);
        _movp(Dest, T);
      }
    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
      // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
      Constant *Shift = Ctx->getConstantInt32(31);
      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      Variable *T_Lo = makeReg(DestLo->getType());
      if (Src0RM->getType() == IceType_i32) {
        _mov(T_Lo, Src0RM);
      } else if (Src0RM->getType() == IceType_i1) {
        _movzx(T_Lo, Src0RM);
        _shl(T_Lo, Shift);
        _sar(T_Lo, Shift);
      } else {
        _movsx(T_Lo, Src0RM);
      }
      _mov(DestLo, T_Lo);
      Variable *T_Hi = nullptr;
      _mov(T_Hi, T_Lo);
      if (Src0RM->getType() != IceType_i1)
        // For i1, the sar instruction is already done above.
        _sar(T_Hi, Shift);
      _mov(DestHi, T_Hi);
    } else if (Src0RM->getType() == IceType_i1) {
      // t1 = src
      // shl t1, dst_bitwidth - 1
      // sar t1, dst_bitwidth - 1
      // dst = t1
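      // e.g., sext i1 -> i32 is (sketch): movzx t, src; shl t, 31; sar t, 31,
      // leaving t equal to 0 or -1.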
      size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
      Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
      Variable *T = makeReg(DestTy);
      if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
        _mov(T, Src0RM);
      } else {
        // Widen the source using movsx or movzx. (It doesn't matter which one,
        // since the following shl/sar overwrite the bits.)
        _movzx(T, Src0RM);
      }
      _shl(T, ShiftAmount);
      _sar(T, ShiftAmount);
      _mov(Dest, T);
    } else {
      // t1 = movsx src; dst = t1
      Variable *T = makeReg(DestTy);
      _movsx(T, Src0RM);
      _mov(Dest, T);
    }
    break;
  }
  case InstCast::Zext: {
    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
    if (isVectorType(DestTy)) {
      // onemask = materialize(1,1,...); dest = onemask & src
      Variable *OneMask = makeVectorOfOnes(DestTy);
      Variable *T = makeReg(DestTy);
      _movp(T, Src0RM);
      _pand(T, OneMask);
      _movp(Dest, T);
    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
      // t1=movzx src; dst.lo=t1; dst.hi=0
      Constant *Zero = Ctx->getConstantZero(IceType_i32);
      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      Variable *Tmp = makeReg(DestLo->getType());
      if (Src0RM->getType() == IceType_i32) {
        _mov(Tmp, Src0RM);
      } else {
        _movzx(Tmp, Src0RM);
      }
      _mov(DestLo, Tmp);
      _mov(DestHi, Zero);
    } else if (Src0RM->getType() == IceType_i1) {
      // t = Src0RM; Dest = t
      Variable *T = nullptr;
      if (DestTy == IceType_i8) {
        _mov(T, Src0RM);
      } else {
        assert(DestTy != IceType_i1);
        assert(Traits::Is64Bit || DestTy != IceType_i64);
        // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
        // In x86-64 we need to widen T to 64 bits to ensure that T -- if
        // written to the stack (i.e., in -Om1) -- will be fully zero-extended.
        T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
        _movzx(T, Src0RM);
      }
      _mov(Dest, T);
    } else {
      // t1 = movzx src; dst = t1
      Variable *T = makeReg(DestTy);
      _movzx(T, Src0RM);
      _mov(Dest, T);
    }
    break;
  }
  case InstCast::Trunc: {
    if (isVectorType(DestTy)) {
      // onemask = materialize(1,1,...); dst = src & onemask
      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
      Type Src0Ty = Src0RM->getType();
      Variable *OneMask = makeVectorOfOnes(Src0Ty);
      Variable *T = makeReg(DestTy);
      _movp(T, Src0RM);
      _pand(T, OneMask);
      _movp(Dest, T);
    } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
      // Make sure we truncate from and into valid registers.
      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
        Src0 = loOperand(Src0);
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      Variable *T = copyToReg8(Src0RM);
      if (DestTy == IceType_i1)
        _and(T, Ctx->getConstantInt1(1));
      _mov(Dest, T);
    } else {
      Operand *Src0 = legalizeUndef(Instr->getSrc(0));
      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
        Src0 = loOperand(Src0);
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      // t1 = trunc Src0RM; Dest = t1
      Variable *T = makeReg(DestTy);
      _mov(T, Src0RM);
      _mov(Dest, T);
    }
    break;
  }
  case InstCast::Fptrunc:
  case InstCast::Fpext: {
    Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
    // t1 = cvt Src0RM; Dest = t1
    Variable *T = makeReg(DestTy);
    _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
    _mov(Dest, T);
    break;
  }
  case InstCast::Fptosi:
    if (isVectorType(DestTy)) {
      assert(DestTy == IceType_v4i32);
      assert(Instr->getSrc(0)->getType() == IceType_v4f32);
      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
      Variable *T = makeReg(DestTy);
      _cvt(T, Src0R, Traits::Insts::Cvt::Tps2dq);
      _movp(Dest, T);
    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
      llvm::report_fatal_error("Helper call was expected");
    } else {
      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
      Variable *T_1 = nullptr;
      if (Traits::Is64Bit && DestTy == IceType_i64) {
        T_1 = makeReg(IceType_i64);
      } else {
        assert(DestTy != IceType_i64);
        T_1 = makeReg(IceType_i32);
      }
      // cvt() requires its integer argument to be a GPR.
      Variable *T_2 = makeReg(DestTy);
      if (isByteSizedType(DestTy)) {
        assert(T_1->getType() == IceType_i32);
        T_1->setRegClass(RCX86_Is32To8);
        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
      }
      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
      if (DestTy == IceType_i1)
        _and(T_2, Ctx->getConstantInt1(1));
      _mov(Dest, T_2);
    }
    break;
  case InstCast::Fptoui:
    if (isVectorType(DestTy)) {
      llvm::report_fatal_error("Helper call was expected");
    } else if (DestTy == IceType_i64 ||
               (!Traits::Is64Bit && DestTy == IceType_i32)) {
      llvm::report_fatal_error("Helper call was expected");
    } else {
      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
      assert(DestTy != IceType_i64);
      Variable *T_1 = nullptr;
      if (Traits::Is64Bit && DestTy == IceType_i32) {
        T_1 = makeReg(IceType_i64);
      } else {
        assert(DestTy != IceType_i32);
        T_1 = makeReg(IceType_i32);
      }
      Variable *T_2 = makeReg(DestTy);
      if (isByteSizedType(DestTy)) {
        assert(T_1->getType() == IceType_i32);
        T_1->setRegClass(RCX86_Is32To8);
        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
      }
      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
      if (DestTy == IceType_i1)
        _and(T_2, Ctx->getConstantInt1(1));
      _mov(Dest, T_2);
    }
    break;
  case InstCast::Sitofp:
    if (isVectorType(DestTy)) {
      assert(DestTy == IceType_v4f32);
      assert(Instr->getSrc(0)->getType() == IceType_v4i32);
      Operand *Src0R = legalizeToReg(Instr->getSrc(0));
      Variable *T = makeReg(DestTy);
      _cvt(T, Src0R, Traits::Insts::Cvt::Dq2ps);
      _movp(Dest, T);
    } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) {
      llvm::report_fatal_error("Helper call was expected");
    } else {
      Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
      // Sign-extend the operand.
      // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
      Variable *T_1 = nullptr;
      if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
        T_1 = makeReg(IceType_i64);
      } else {
        assert(Src0RM->getType() != IceType_i64);
        T_1 = makeReg(IceType_i32);
      }
      Variable *T_2 = makeReg(DestTy);
      if (Src0RM->getType() == T_1->getType())
        _mov(T_1, Src0RM);
      else
        _movsx(T_1, Src0RM);
      _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
      _mov(Dest, T_2);
    }
    break;
  case InstCast::Uitofp: {
    Operand *Src0 = Instr->getSrc(0);
    if (isVectorType(Src0->getType())) {
      llvm::report_fatal_error("Helper call was expected");
    } else if (Src0->getType() == IceType_i64 ||
               (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
      llvm::report_fatal_error("Helper call was expected");
    } else {
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      // Zero-extend the operand.
      // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
      Variable *T_1 = nullptr;
      if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
        T_1 = makeReg(IceType_i64);
      } else {
        assert(Src0RM->getType() != IceType_i64);
        assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
        T_1 = makeReg(IceType_i32);
      }
      Variable *T_2 = makeReg(DestTy);
      if (Src0RM->getType() == T_1->getType())
        _mov(T_1, Src0RM);
      else
        _movzx(T_1, Src0RM)->setMustKeep();
      _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
      _mov(Dest, T_2);
    }
    break;
  }
  case InstCast::Bitcast: {
    Operand *Src0 = Instr->getSrc(0);
    if (DestTy == Src0->getType()) {
      auto *Assign = InstAssign::create(Func, Dest, Src0);
      lowerAssign(Assign);
      return;
    }
    switch (DestTy) {
    default:
      llvm_unreachable("Unexpected Bitcast dest type");
    case IceType_i8: {
      llvm::report_fatal_error("Helper call was expected");
    } break;
    case IceType_i16: {
      llvm::report_fatal_error("Helper call was expected");
    } break;
    case IceType_i32:
    case IceType_f32: {
      Variable *Src0R = legalizeToReg(Src0);
      Variable *T = makeReg(DestTy);
      _movd(T, Src0R);
      _mov(Dest, T);
    } break;
    case IceType_i64: {
      assert(Src0->getType() == IceType_f64);
      if (Traits::Is64Bit) {
        Variable *Src0R = legalizeToReg(Src0);
        Variable *T = makeReg(IceType_i64);
        _movd(T, Src0R);
        _mov(Dest, T);
      } else {
        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
        // a.i64 = bitcast b.f64 ==>
        //   s.f64 = spill b.f64
        //   t_lo.i32 = lo(s.f64)
        //   a_lo.i32 = t_lo.i32
        //   t_hi.i32 = hi(s.f64)
        //   a_hi.i32 = t_hi.i32
        Operand *SpillLo, *SpillHi;
        if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
          Variable *Spill = Func->makeVariable(IceType_f64);
          Spill->setLinkedTo(Src0Var);
          Spill->setMustNotHaveReg();
          _movq(Spill, Src0RM);
          SpillLo = Traits::VariableSplit::create(Func, Spill,
                                                  Traits::VariableSplit::Low);
          SpillHi = Traits::VariableSplit::create(Func, Spill,
                                                  Traits::VariableSplit::High);
        } else {
          SpillLo = loOperand(Src0RM);
          SpillHi = hiOperand(Src0RM);
        }

        auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
        Variable *T_Lo = makeReg(IceType_i32);
        Variable *T_Hi = makeReg(IceType_i32);

        _mov(T_Lo, SpillLo);
        _mov(DestLo, T_Lo);
        _mov(T_Hi, SpillHi);
        _mov(DestHi, T_Hi);
      }
    } break;
    case IceType_f64: {
      assert(Src0->getType() == IceType_i64);
      if (Traits::Is64Bit) {
        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
        Variable *T = makeReg(IceType_f64);
        _movd(T, Src0RM);
        _mov(Dest, T);
      } else {
        Src0 = legalize(Src0);
        if (llvm::isa<X86OperandMem>(Src0)) {
          Variable *T = makeReg(DestTy);
          _movq(T, Src0);
          _movq(Dest, T);
          break;
        }
        // a.f64 = bitcast b.i64 ==>
        //   t_lo.i32 = b_lo.i32
        //   FakeDef(s.f64)
        //   lo(s.f64) = t_lo.i32
        //   t_hi.i32 = b_hi.i32
        //   hi(s.f64) = t_hi.i32
        //   a.f64 = s.f64
        Variable *Spill = Func->makeVariable(IceType_f64);
        Spill->setLinkedTo(Dest);
        Spill->setMustNotHaveReg();

        Variable *T_Lo = nullptr, *T_Hi = nullptr;
        auto *SpillLo = Traits::VariableSplit::create(
            Func, Spill, Traits::VariableSplit::Low);
        auto *SpillHi = Traits::VariableSplit::create(
            Func, Spill, Traits::VariableSplit::High);
        _mov(T_Lo, loOperand(Src0));
        // Technically, the Spill is defined after the _store happens, but
        // SpillLo is considered a "use" of Spill so define Spill before it is
        // used.
        Context.insert<InstFakeDef>(Spill);
        _store(T_Lo, SpillLo);
        _mov(T_Hi, hiOperand(Src0));
        _store(T_Hi, SpillHi);
        _movq(Dest, Spill);
      }
    } break;
    case IceType_v8i1: {
      llvm::report_fatal_error("Helper call was expected");
    } break;
    case IceType_v16i1: {
      llvm::report_fatal_error("Helper call was expected");
    } break;
    case IceType_v8i16:
    case IceType_v16i8:
    case IceType_v4i32:
    case IceType_v4f32: {
      if (Src0->getType() == IceType_i32) {
        // Bitcast requires equal type sizes, which isn't strictly the case
        // between scalars and vectors, but to emulate v4i8 vectors one has to
        // use v16i8 vectors.
        assert(getFlags().getApplicationBinaryInterface() != ABI_PNaCl &&
               "PNaCl only supports real 128-bit vectors");
        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
        Variable *T = makeReg(DestTy);
        _movd(T, Src0RM);
        _mov(Dest, T);
      } else {
        _movp(Dest, legalizeToReg(Src0));
      }
    } break;
    }
    break;
  }
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerExtractElement(
    const InstExtractElement *Instr) {
  Operand *SourceVectNotLegalized = Instr->getSrc(0);
  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
  // Only constant indices are allowed in PNaCl IR.
  assert(ElementIndex);

  unsigned Index = ElementIndex->getValue();
  Type Ty = SourceVectNotLegalized->getType();
  Type ElementTy = typeElementType(Ty);
  Type InVectorElementTy = Traits::getInVectorElementType(Ty);

  // TODO(wala): Determine the best lowering sequences for each type.
  bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
                     (InstructionSet >= Traits::SSE4_1 && Ty != IceType_v4f32);
  Variable *ExtractedElementR =
      makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
  if (CanUsePextr) {
    // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
    // bits of the destination register, so we represent this by always
    // extracting into an i32 register.  The _mov into Dest below will do
    // truncation as necessary.
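    // e.g., extracting element 2 of a v8i16 emits roughly (AT&T syntax):
    //   pextrw $2, %xmm1, %eax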
    Constant *Mask = Ctx->getConstantInt32(Index);
    Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
    _pextr(ExtractedElementR, SourceVectR, Mask);
  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
    // Use pshufd and movd/movss.
    Variable *T = nullptr;
    if (Index) {
      // The shuffle only needs to occur if the element to be extracted is not
      // at the lowest index.
      Constant *Mask = Ctx->getConstantInt32(Index);
      T = makeReg(Ty);
      _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
    } else {
      T = legalizeToReg(SourceVectNotLegalized);
    }

    if (InVectorElementTy == IceType_i32) {
      _movd(ExtractedElementR, T);
    } else { // InVectorElementTy == IceType_f32
      // TODO(wala): _movss is only used here because _mov does not allow a
      // vector source and a scalar destination.  _mov should be able to be
      // used here.
      // _movss is a binary instruction, so the FakeDef is needed to keep the
      // live range analysis consistent.
      Context.insert<InstFakeDef>(ExtractedElementR);
      _movss(ExtractedElementR, T);
    }
  } else {
    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
    // Spill the value to a stack slot and do the extraction in memory.
    //
    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
    // for legalizing to mem is implemented.
    Variable *Slot = Func->makeVariable(Ty);
    Slot->setMustNotHaveReg();
    _movp(Slot, legalizeToReg(SourceVectNotLegalized));

    // Compute the location of the element in memory.
    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
    X86OperandMem *Loc =
        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
    _mov(ExtractedElementR, Loc);
  }

  if (ElementTy == IceType_i1) {
    // Truncate extracted integers to i1s if necessary.
    Variable *T = makeReg(IceType_i1);
    InstCast *Cast =
        InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
    lowerCast(Cast);
    ExtractedElementR = T;
  }

  // Copy the element to the destination.
  Variable *Dest = Instr->getDest();
  _mov(Dest, ExtractedElementR);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerFcmp(const InstFcmp *Fcmp) {
  Variable *Dest = Fcmp->getDest();

  if (isVectorType(Dest->getType())) {
    lowerFcmpVector(Fcmp);
  } else {
    constexpr Inst *Consumer = nullptr;
    lowerFcmpAndConsumer(Fcmp, Consumer);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
                                                     const Inst *Consumer) {
  Operand *Src0 = Fcmp->getSrc(0);
  Operand *Src1 = Fcmp->getSrc(1);
  Variable *Dest = Fcmp->getDest();

  if (Consumer != nullptr) {
    if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
      if (lowerOptimizeFcmpSelect(Fcmp, Select))
        return;
    }
  }

  if (isVectorType(Dest->getType())) {
    lowerFcmp(Fcmp);
    if (Consumer != nullptr)
      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
    return;
  }

  // Lowering a = fcmp cond, b, c
  //   ucomiss b, c       /* only if C1 != Br_None */
  //                      /* but swap b,c order if SwapOperands==true */
  //   mov a, <default>
  //   j<C1> label        /* only if C1 != Br_None */
  //   j<C2> label        /* only if C2 != Br_None */
  //   FakeUse(a)         /* only if C1 != Br_None */
  //   mov a, !<default>  /* only if C1 != Br_None */
  //   label:             /* only if C1 != Br_None */
  //
  // setcc lowering when C1 != Br_None && C2 == Br_None:
  //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
  //   setcc a, C1
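  //
  // For instance, fcmp ogt can lower to a single ucomiss b, c followed by
  // seta, since seta (CF==0 && ZF==0) is true exactly when b > c and the
  // operands are ordered (an unordered ucomiss sets ZF, PF, and CF).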
  InstFcmp::FCond Condition = Fcmp->getCondition();
  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);
  if (Traits::TableFcmp[Condition].SwapScalarOperands)
    std::swap(Src0, Src1);
  const bool HasC1 = (Traits::TableFcmp[Condition].C1 != Traits::Cond::Br_None);
  const bool HasC2 = (Traits::TableFcmp[Condition].C2 != Traits::Cond::Br_None);
  if (HasC1) {
    Src0 = legalize(Src0);
    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    Variable *T = nullptr;
    _mov(T, Src0);
    _ucomiss(T, Src1RM);
    if (!HasC2) {
      assert(Traits::TableFcmp[Condition].Default);
      setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer);
      return;
    }
  }
  int32_t IntDefault = Traits::TableFcmp[Condition].Default;
  if (Consumer == nullptr) {
    Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
    _mov(Dest, Default);
    if (HasC1) {
      InstX86Label *Label = InstX86Label::create(Func, this);
      _br(Traits::TableFcmp[Condition].C1, Label);
      if (HasC2) {
        _br(Traits::TableFcmp[Condition].C2, Label);
      }
      Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
      _redefined(_mov(Dest, NonDefault));
      Context.insert(Label);
    }
    return;
  }
  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
    CfgNode *TrueSucc = Br->getTargetTrue();
    CfgNode *FalseSucc = Br->getTargetFalse();
    if (IntDefault != 0)
      std::swap(TrueSucc, FalseSucc);
    if (HasC1) {
      _br(Traits::TableFcmp[Condition].C1, FalseSucc);
      if (HasC2) {
        _br(Traits::TableFcmp[Condition].C2, FalseSucc);
      }
      _br(TrueSucc);
      return;
    }
    _br(FalseSucc);
    return;
  }
  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
    Operand *SrcT = Select->getTrueOperand();
    Operand *SrcF = Select->getFalseOperand();
    Variable *SelectDest = Select->getDest();
    if (IntDefault != 0)
      std::swap(SrcT, SrcF);
    lowerMove(SelectDest, SrcF, false);
    if (HasC1) {
      InstX86Label *Label = InstX86Label::create(Func, this);
      _br(Traits::TableFcmp[Condition].C1, Label);
      if (HasC2) {
        _br(Traits::TableFcmp[Condition].C2, Label);
      }
      static constexpr bool IsRedefinition = true;
      lowerMove(SelectDest, SrcT, IsRedefinition);
      Context.insert(Label);
    }
    return;
  }
  llvm::report_fatal_error("Unexpected consumer type");
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerFcmpVector(const InstFcmp *Fcmp) {
  Operand *Src0 = Fcmp->getSrc(0);
  Operand *Src1 = Fcmp->getSrc(1);
  Variable *Dest = Fcmp->getDest();

  if (!isVectorType(Dest->getType()))
    llvm::report_fatal_error("Expected vector compare");

  InstFcmp::FCond Condition = Fcmp->getCondition();
  assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize);

  if (Traits::TableFcmp[Condition].SwapVectorOperands)
    std::swap(Src0, Src1);

  Variable *T = nullptr;

  if (Condition == InstFcmp::True) {
    // makeVectorOfOnes() requires an integer vector type.
    T = makeVectorOfMinusOnes(IceType_v4i32);
  } else if (Condition == InstFcmp::False) {
    T = makeVectorOfZeros(Dest->getType());
  } else {
    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    if (llvm::isa<X86OperandMem>(Src1RM))
      Src1RM = legalizeToReg(Src1RM);

    switch (Condition) {
    default: {
      const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate;
      assert(Predicate != Traits::Cond::Cmpps_Invalid);
      T = makeReg(Src0RM->getType());
      _movp(T, Src0RM);
      _cmpps(T, Src1RM, Predicate);
    } break;
    case InstFcmp::One: {
      // Check both unequal and ordered.
      T = makeReg(Src0RM->getType());
      Variable *T2 = makeReg(Src0RM->getType());
      _movp(T, Src0RM);
      _cmpps(T, Src1RM, Traits::Cond::Cmpps_neq);
      _movp(T2, Src0RM);
      _cmpps(T2, Src1RM, Traits::Cond::Cmpps_ord);
      _pand(T, T2);
    } break;
    case InstFcmp::Ueq: {
      // Check both equal or unordered.
      T = makeReg(Src0RM->getType());
      Variable *T2 = makeReg(Src0RM->getType());
      _movp(T, Src0RM);
      _cmpps(T, Src1RM, Traits::Cond::Cmpps_eq);
      _movp(T2, Src0RM);
      _cmpps(T2, Src1RM, Traits::Cond::Cmpps_unord);
      _por(T, T2);
    } break;
    }
  }

  assert(T != nullptr);
  _movp(Dest, T);
  eliminateNextVectorSextInstruction(Dest);
}

inline bool isZero(const Operand *Opnd) {
  if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
    return C64->getValue() == 0;
  if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
    return C32->getValue() == 0;
  return false;
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerIcmpAndConsumer(const InstIcmp *Icmp,
                                                     const Inst *Consumer) {
  Operand *Src0 = legalize(Icmp->getSrc(0));
  Operand *Src1 = legalize(Icmp->getSrc(1));
  Variable *Dest = Icmp->getDest();

  if (isVectorType(Dest->getType())) {
    lowerIcmp(Icmp);
    if (Consumer != nullptr)
      lowerSelectVector(llvm::cast<InstSelect>(Consumer));
    return;
  }

  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
    lowerIcmp64(Icmp, Consumer);
    return;
  }

  // cmp b, c
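  // Shortcut compares against zero: unsigned x >= 0 is always true and
  // unsigned x < 0 is always false, so no cmp is needed for those cases.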
  if (isZero(Src1)) {
    switch (Icmp->getCondition()) {
    default:
      break;
    case InstIcmp::Uge:
      movOrConsumer(true, Dest, Consumer);
      return;
    case InstIcmp::Ult:
      movOrConsumer(false, Dest, Consumer);
      return;
    }
  }
  Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
  _cmp(Src0RM, Src1);
  setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest,
                  Consumer);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerIcmpVector(const InstIcmp *Icmp) {
  Operand *Src0 = legalize(Icmp->getSrc(0));
  Operand *Src1 = legalize(Icmp->getSrc(1));
  Variable *Dest = Icmp->getDest();

  if (!isVectorType(Dest->getType()))
    llvm::report_fatal_error("Expected a vector compare");

  Type Ty = Src0->getType();
  // Promote i1 vectors to 128 bit integer vector types.
  if (typeElementType(Ty) == IceType_i1) {
    Type NewTy = IceType_NUM;
    switch (Ty) {
    default:
      llvm::report_fatal_error("unexpected type");
      break;
    case IceType_v4i1:
      NewTy = IceType_v4i32;
      break;
    case IceType_v8i1:
      NewTy = IceType_v8i16;
      break;
    case IceType_v16i1:
      NewTy = IceType_v16i8;
      break;
    }
    Variable *NewSrc0 = Func->makeVariable(NewTy);
    Variable *NewSrc1 = Func->makeVariable(NewTy);
    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
    lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
    Src0 = NewSrc0;
    Src1 = NewSrc1;
    Ty = NewTy;
  }

  InstIcmp::ICond Condition = Icmp->getCondition();

  Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
  Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);

  // SSE2 only has signed comparison operations. Transform unsigned inputs in
  // a manner that allows for the use of signed comparison operations by
  // flipping the high order bits.
  if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
      Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
    Variable *T0 = makeReg(Ty);
    Variable *T1 = makeReg(Ty);
    Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
    _movp(T0, Src0RM);
    _pxor(T0, HighOrderBits);
    _movp(T1, Src1RM);
    _pxor(T1, HighOrderBits);
    Src0RM = T0;
    Src1RM = T1;
  }
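  // e.g., for v4i32 HighOrderBits is <0x80000000 x 4>; xor-ing both operands
  // with it maps unsigned order onto signed order, so the signed pcmpgt below
  // computes the unsigned result.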
  Variable *T = makeReg(Ty);
  switch (Condition) {
  default:
    llvm_unreachable("unexpected condition");
    break;
  case InstIcmp::Eq: {
    if (llvm::isa<X86OperandMem>(Src1RM))
      Src1RM = legalizeToReg(Src1RM);
    _movp(T, Src0RM);
    _pcmpeq(T, Src1RM);
  } break;
  case InstIcmp::Ne: {
    if (llvm::isa<X86OperandMem>(Src1RM))
      Src1RM = legalizeToReg(Src1RM);
    _movp(T, Src0RM);
    _pcmpeq(T, Src1RM);
    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
    _pxor(T, MinusOne);
  } break;
  case InstIcmp::Ugt:
  case InstIcmp::Sgt: {
    if (llvm::isa<X86OperandMem>(Src1RM))
      Src1RM = legalizeToReg(Src1RM);
    _movp(T, Src0RM);
    _pcmpgt(T, Src1RM);
  } break;
  case InstIcmp::Uge:
  case InstIcmp::Sge: {
    // !(Src1RM > Src0RM)
    if (llvm::isa<X86OperandMem>(Src0RM))
      Src0RM = legalizeToReg(Src0RM);
    _movp(T, Src1RM);
    _pcmpgt(T, Src0RM);
    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
    _pxor(T, MinusOne);
  } break;
  case InstIcmp::Ult:
  case InstIcmp::Slt: {
    if (llvm::isa<X86OperandMem>(Src0RM))
      Src0RM = legalizeToReg(Src0RM);
    _movp(T, Src1RM);
    _pcmpgt(T, Src0RM);
  } break;
  case InstIcmp::Ule:
  case InstIcmp::Sle: {
    // !(Src0RM > Src1RM)
    if (llvm::isa<X86OperandMem>(Src1RM))
      Src1RM = legalizeToReg(Src1RM);
    _movp(T, Src0RM);
    _pcmpgt(T, Src1RM);
    Variable *MinusOne = makeVectorOfMinusOnes(Ty);
    _pxor(T, MinusOne);
  } break;
  }

  _movp(Dest, T);
  eliminateNextVectorSextInstruction(Dest);
}

template <typename TraitsType>
template <typename T>
typename std::enable_if<!T::Is64Bit, void>::type
TargetX86Base<TraitsType>::lowerIcmp64(const InstIcmp *Icmp,
                                       const Inst *Consumer) {
  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
  Operand *Src0 = legalize(Icmp->getSrc(0));
  Operand *Src1 = legalize(Icmp->getSrc(1));
  Variable *Dest = Icmp->getDest();
  InstIcmp::ICond Condition = Icmp->getCondition();
  assert(static_cast<size_t>(Condition) < Traits::TableIcmp64Size);
  Operand *Src0LoRM = nullptr;
  Operand *Src0HiRM = nullptr;
  // Legalize the portions of Src0 that are going to be needed.
  if (isZero(Src1)) {
    switch (Condition) {
    default:
      llvm_unreachable("unexpected condition");
      break;
    // These two are not optimized, so we fall through to the general case,
    // which needs the upper and lower halves legalized.
    case InstIcmp::Sgt:
    case InstIcmp::Sle:
    // These four compare after performing an "or" of the high and low half, so
    // they need the upper and lower halves legalized.
    case InstIcmp::Eq:
    case InstIcmp::Ule:
    case InstIcmp::Ne:
    case InstIcmp::Ugt:
      Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
    // These two test only the high half's sign bit, so they need only
    // the upper half legalized.
    case InstIcmp::Sge:
    case InstIcmp::Slt:
      Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
      break;

    // These two move constants and hence need no legalization.
    case InstIcmp::Uge:
    case InstIcmp::Ult:
      break;
    }
  } else {
    Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
    Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
  }
  // Optimize comparisons with zero.
  if (isZero(Src1)) {
    Constant *SignMask = Ctx->getConstantInt32(0x80000000);
    Variable *Temp = nullptr;
    switch (Condition) {
    default:
      llvm_unreachable("unexpected condition");
      break;
    case InstIcmp::Eq:
    case InstIcmp::Ule:
      // Mov Src0HiRM first, because it was legalized most recently, and will
      // sometimes avoid a move before the OR.
      _mov(Temp, Src0HiRM);
      _or(Temp, Src0LoRM);
      Context.insert<InstFakeUse>(Temp);
      setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
      return;
    case InstIcmp::Ne:
    case InstIcmp::Ugt:
      // Mov Src0HiRM first, because it was legalized most recently, and will
      // sometimes avoid a move before the OR.
      _mov(Temp, Src0HiRM);
      _or(Temp, Src0LoRM);
      Context.insert<InstFakeUse>(Temp);
      setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
      return;
    case InstIcmp::Uge:
      movOrConsumer(true, Dest, Consumer);
      return;
    case InstIcmp::Ult:
      movOrConsumer(false, Dest, Consumer);
      return;
    case InstIcmp::Sgt:
      break;
    case InstIcmp::Sge:
      _test(Src0HiRM, SignMask);
      setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
      return;
    case InstIcmp::Slt:
      _test(Src0HiRM, SignMask);
      setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
      return;
    case InstIcmp::Sle:
      break;
    }
  }
  // Handle general compares.
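  // The two-word scheme below compares the high words first (C1 picks the
  // "true" outcome, C2 the "false" outcome) and falls through to a low-word
  // compare decided by C3 only on a high-word tie; e.g., for ult (sketch):
  //   cmp hi; jb true; ja false; cmp lo; jb true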
  Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
  Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
  if (Consumer == nullptr) {
    Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
    Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
    _mov(Dest, One);
    _cmp(Src0HiRM, Src1HiRI);
    if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
    if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
    _cmp(Src0LoRM, Src1LoRI);
    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
    Context.insert(LabelFalse);
    _redefined(_mov(Dest, Zero));
    Context.insert(LabelTrue);
    return;
  }
  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
    _cmp(Src0HiRM, Src1HiRI);
    if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
      _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue());
    if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
      _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse());
    _cmp(Src0LoRM, Src1LoRI);
    _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(),
        Br->getTargetFalse());
    return;
  }
  if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
    Operand *SrcT = Select->getTrueOperand();
    Operand *SrcF = Select->getFalseOperand();
    Variable *SelectDest = Select->getDest();
    InstX86Label *LabelFalse = InstX86Label::create(Func, this);
    InstX86Label *LabelTrue = InstX86Label::create(Func, this);
    lowerMove(SelectDest, SrcT, false);
    _cmp(Src0HiRM, Src1HiRI);
    if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
      _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
    if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
      _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
    _cmp(Src0LoRM, Src1LoRI);
    _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
    Context.insert(LabelFalse);
    static constexpr bool IsRedefinition = true;
    lowerMove(SelectDest, SrcF, IsRedefinition);
    Context.insert(LabelTrue);
    return;
  }
  llvm::report_fatal_error("Unexpected consumer type");
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::setccOrConsumer(BrCond Condition,
                                                Variable *Dest,
                                                const Inst *Consumer) {
  if (Consumer == nullptr) {
    _setcc(Dest, Condition);
    return;
  }
  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
    _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
    return;
  }
  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
    Operand *SrcT = Select->getTrueOperand();
    Operand *SrcF = Select->getFalseOperand();
    Variable *SelectDest = Select->getDest();
    lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
    return;
  }
  llvm::report_fatal_error("Unexpected consumer type");
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::movOrConsumer(bool IcmpResult, Variable *Dest,
                                              const Inst *Consumer) {
  if (Consumer == nullptr) {
    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
    return;
  }
  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
    // TODO(sehr,stichnot): This could be done with a single unconditional
    // branch instruction, but Subzero doesn't currently know how to handle the
    // resulting control flow graph changes. Make it do so, to eliminate the
    // mov and cmp.
    _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
    _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
    _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
    return;
  }
  if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
    Operand *Src = nullptr;
    if (IcmpResult) {
      Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
    } else {
      Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
    }
    Variable *SelectDest = Select->getDest();
    lowerMove(SelectDest, Src, false);
    return;
  }
  llvm::report_fatal_error("Unexpected consumer type");
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerArithAndConsumer(
    const InstArithmetic *Arith, const Inst *Consumer) {
  Variable *T = nullptr;
  Operand *Src0 = legalize(Arith->getSrc(0));
  Operand *Src1 = legalize(Arith->getSrc(1));
  Variable *Dest = Arith->getDest();
  switch (Arith->getOp()) {
  default:
    llvm_unreachable("arithmetic operator not AND or OR");
    break;
  case InstArithmetic::And:
    _mov(T, Src0);
    // Test cannot have an address in the second position.  Since T is
    // guaranteed to be a register and Src1 could be a memory load, ensure
    // that the second argument is a register.
    if (llvm::isa<Constant>(Src1))
      _test(T, Src1);
    else
      _test(Src1, T);
    break;
  case InstArithmetic::Or:
    _mov(T, Src0);
    _or(T, Src1);
    break;
  }

  if (Consumer == nullptr) {
    llvm::report_fatal_error("Expected a consumer instruction");
  }
  if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
    Context.insert<InstFakeUse>(T);
    Context.insert<InstFakeDef>(Dest);
    _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
    return;
  }
  llvm::report_fatal_error("Unexpected consumer type");
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerInsertElement(
    const InstInsertElement *Instr) {
  Operand *SourceVectNotLegalized = Instr->getSrc(0);
  Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
  auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
  // Only constant indices are allowed in PNaCl IR.
  assert(ElementIndex);
  unsigned Index = ElementIndex->getValue();
  assert(Index < typeNumElements(SourceVectNotLegalized->getType()));

  Type Ty = SourceVectNotLegalized->getType();
  Type ElementTy = typeElementType(Ty);
  Type InVectorElementTy = Traits::getInVectorElementType(Ty);

  if (ElementTy == IceType_i1) {
    // Expand the element to the appropriate size for it to be inserted in the
    // vector.
    Variable *Expanded = Func->makeVariable(InVectorElementTy);
    auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
                                  ElementToInsertNotLegalized);
    lowerCast(Cast);
    ElementToInsertNotLegalized = Expanded;
  }

  if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
      InstructionSet >= Traits::SSE4_1) {
    // Use insertps, pinsrb, pinsrw, or pinsrd.
    Operand *ElementRM =
        legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
    Operand *SourceVectRM =
        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
    Variable *T = makeReg(Ty);
    _movp(T, SourceVectRM);
    if (Ty == IceType_v4f32) {
      _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
    } else {
      // For the pinsrb and pinsrw instructions, when the source operand is a
      // register, it must be a full r32 register like eax, and not ax/al/ah.
      // For filetype=asm, InstX86Pinsr<TraitsType>::emit() compensates for the
      // use of r16 and r8 by converting them through getBaseReg(), while
      // emitIAS() validates that the original and base register encodings are
      // the same.
      if (ElementRM->getType() == IceType_i8 &&
          llvm::isa<Variable>(ElementRM)) {
        // Don't use ah/bh/ch/dh for pinsrb.
        ElementRM = copyToReg8(ElementRM);
      }
      _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
    }
    _movp(Instr->getDest(), T);
  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
    // Use shufps or movss.
    Variable *ElementR = nullptr;
    Operand *SourceVectRM =
        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);

    if (InVectorElementTy == IceType_f32) {
      // ElementR will be in an XMM register since it is floating point.
      ElementR = legalizeToReg(ElementToInsertNotLegalized);
    } else {
      // Copy an integer to an XMM register.
      Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
      ElementR = makeReg(Ty);
      _movd(ElementR, T);
    }

    if (Index == 0) {
      Variable *T = makeReg(Ty);
      _movp(T, SourceVectRM);
      _movss(T, ElementR);
      _movp(Instr->getDest(), T);
      return;
    }

    // shufps treats the source and destination operands as vectors of four
    // doublewords. The destination's two high doublewords are selected from
    // the source operand and the two low doublewords are selected from the
    // (original value of) the destination operand. An insertelement operation
    // can be effected with a sequence of two shufps operations with
    // appropriate masks. In all cases below, Element[0] is being inserted into
    // SourceVectOperand. Indices are ordered from left to right.
    //
    // insertelement into index 1 (result is stored in ElementR):
    //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
    //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
    //
    // insertelement into index 2 (result is stored in T):
    //   T := SourceVectRM
    //   ElementR := ElementR[0, 0] T[0, 3]
    //   T := T[0, 1] ElementR[0, 3]
    //
    // insertelement into index 3 (result is stored in T):
    //   T := SourceVectRM
    //   ElementR := ElementR[0, 0] T[0, 2]
    //   T := T[0, 1] ElementR[3, 0]
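    //
    // Each shufps immediate packs four 2-bit doubleword selectors (the low
    // two fields pick from the destination, the high two from the source),
    // so e.g. Mask2[0] = 227 = 0b11'10'00'11 below.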
    const unsigned char Mask1[3] = {0, 192, 128};
    const unsigned char Mask2[3] = {227, 196, 52};

    Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
    Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);

    if (Index == 1) {
      _shufps(ElementR, SourceVectRM, Mask1Constant);
      _shufps(ElementR, SourceVectRM, Mask2Constant);
      _movp(Instr->getDest(), ElementR);
    } else {
      Variable *T = makeReg(Ty);
      _movp(T, SourceVectRM);
      _shufps(ElementR, T, Mask1Constant);
      _shufps(T, ElementR, Mask2Constant);
      _movp(Instr->getDest(), T);
    }
  } else {
    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
    // Spill the value to a stack slot and perform the insertion in memory.
    //
    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
    // for legalizing to mem is implemented.
    Variable *Slot = Func->makeVariable(Ty);
    Slot->setMustNotHaveReg();
    _movp(Slot, legalizeToReg(SourceVectNotLegalized));

    // Compute the location of the position to insert in memory.
    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
    X86OperandMem *Loc =
        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
    _store(legalizeToReg(ElementToInsertNotLegalized), Loc);

    Variable *T = makeReg(Ty);
    _movp(T, Slot);
    _movp(Instr->getDest(), T);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerIntrinsicCall(
    const InstIntrinsicCall *Instr) {
  switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
  case Intrinsics::AtomicCmpxchg: {
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(3)),
            getConstantMemoryOrder(Instr->getArg(4)))) {
      Func->setError("Unexpected memory ordering for AtomicCmpxchg");
      return;
    }
    Variable *DestPrev = Instr->getDest();
    Operand *PtrToMem = legalize(Instr->getArg(0));
    Operand *Expected = legalize(Instr->getArg(1));
    Operand *Desired = legalize(Instr->getArg(2));
    if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
      return;
    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
    return;
  }
  case Intrinsics::AtomicFence:
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(0)))) {
      Func->setError("Unexpected memory ordering for AtomicFence");
      return;
    }
    _mfence();
    return;
  case Intrinsics::AtomicFenceAll:
    // NOTE: FenceAll should prevent any load/store from being moved across the
    // fence (both atomic and non-atomic). The InstX8632Mfence instruction is
    // currently marked coarsely as "HasSideEffects".
    _mfence();
    return;
  case Intrinsics::AtomicIsLockFree: {
    // X86 is always lock free for 8/16/32/64 bit accesses.
    // TODO(jvoung): Since the result is constant when given a constant byte
    // size, this opens up DCE opportunities.
    Operand *ByteSize = Instr->getArg(0);
    Variable *Dest = Instr->getDest();
    if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
      Constant *Result;
      switch (CI->getValue()) {
      default:
        // Some x86-64 processors support the cmpxchg16b instruction, which can
        // make 16-byte operations lock free (when used with the LOCK prefix).
        // However, that's not supported in 32-bit mode, so just return 0 even
        // for large sizes.
        Result = Ctx->getConstantZero(IceType_i32);
        break;
      case 1:
      case 2:
      case 4:
      case 8:
        Result = Ctx->getConstantInt32(1);
        break;
      }
      _mov(Dest, Result);
      return;
    }
    // The PNaCl ABI requires the byte size to be a compile-time constant.
    Func->setError("AtomicIsLockFree byte size should be compile-time const");
    return;
  }
  case Intrinsics::AtomicLoad: {
    // We require the memory address to be naturally aligned. Given that is the
    // case, then normal loads are atomic.
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(1)))) {
      Func->setError("Unexpected memory ordering for AtomicLoad");
      return;
    }
    Variable *Dest = Instr->getDest();
    if (!Traits::Is64Bit) {
      if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
        // Follow what GCC does and use a movq instead of what lowerLoad()
        // normally does (split the load into two). Thus, this skips
        // load/arithmetic op folding. Load/arithmetic folding can't happen
        // anyway, since this is x86-32 and integer arithmetic only happens on
        // 32-bit quantities.
        Variable *T = makeReg(IceType_f64);
        X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
        _movq(T, Addr);
        // Then cast the bits back out of the XMM register to the i64 Dest.
        auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
        lowerCast(Cast);
        // Make sure that the atomic load isn't elided when unused.
        Context.insert<InstFakeUse>(Dest64On32->getLo());
        Context.insert<InstFakeUse>(Dest64On32->getHi());
        return;
      }
    }
    auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
    lowerLoad(Load);
    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
    // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
    // the FakeUse on the last-inserted instruction's dest.
    Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
    return;
  }
  case Intrinsics::AtomicRMW:
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(3)))) {
      Func->setError("Unexpected memory ordering for AtomicRMW");
      return;
    }
    lowerAtomicRMW(
        Instr->getDest(),
        static_cast<uint32_t>(
            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
        Instr->getArg(1), Instr->getArg(2));
    return;
  case Intrinsics::AtomicStore: {
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(2)))) {
      Func->setError("Unexpected memory ordering for AtomicStore");
      return;
    }
    // We require the memory address to be naturally aligned. Given that is the
    // case, then normal stores are atomic. Add a fence after the store to make
    // it visible.
    Operand *Value = Instr->getArg(0);
    Operand *Ptr = Instr->getArg(1);
    if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
      // Use a movq instead of what lowerStore() normally does (split the store
      // into two), following what GCC does. Cast the bits from the i64 into an
      // xmm register first.
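      // A sketch of the intended sequence (register choice is made later):
      //   ; bitcast i64 -> f64 moves the value into an xmm register
      //   movq QWORD PTR [ptr], xmm0  ; one indivisible 64-bit store
      //   mfence                      ; make the store visible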
      Variable *T = makeReg(IceType_f64);
      auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
      lowerCast(Cast);
      // Then store XMM w/ a movq.
      X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
      _storeq(T, Addr);
      _mfence();
      return;
    }
    auto *Store = InstStore::create(Func, Value, Ptr);
    lowerStore(Store);
    _mfence();
    return;
  }
  case Intrinsics::Bswap: {
    Variable *Dest = Instr->getDest();
    Operand *Val = Instr->getArg(0);
    // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
    // must be a register. Use rotate left for 16-bit bswap.
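    // E.g., the i16 case below reduces to a single rotate: "rol ax, 8"
    // exchanges the two bytes of ax.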
    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
      Val = legalizeUndef(Val);
      Variable *T_Lo = legalizeToReg(loOperand(Val));
      Variable *T_Hi = legalizeToReg(hiOperand(Val));
      auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
      auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      _bswap(T_Lo);
      _bswap(T_Hi);
      _mov(DestLo, T_Hi);
      _mov(DestHi, T_Lo);
    } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
               Val->getType() == IceType_i32) {
      Variable *T = legalizeToReg(Val);
      _bswap(T);
      _mov(Dest, T);
    } else {
      assert(Val->getType() == IceType_i16);
      Constant *Eight = Ctx->getConstantInt16(8);
      Variable *T = nullptr;
      Val = legalize(Val);
      _mov(T, Val);
      _rol(T, Eight);
      _mov(Dest, T);
    }
    return;
  }
  case Intrinsics::Ctpop: {
    Variable *Dest = Instr->getDest();
    Variable *T = nullptr;
    Operand *Val = Instr->getArg(0);
    Type ValTy = Val->getType();
    assert(ValTy == IceType_i32 || ValTy == IceType_i64);

    if (!Traits::Is64Bit) {
      T = Dest;
    } else {
      T = makeReg(IceType_i64);
      if (ValTy == IceType_i32) {
        // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
        // converting the value to 64 bits and using ctpop_i64. _movzx ensures
        // that no bits are set in Val's upper 32 bits.
        Variable *V = makeReg(IceType_i64);
        Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
        _movzx(V, ValRM);
        Val = V;
      }
      ValTy = IceType_i64;
    }

    InstCall *Call =
        makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
                                            : RuntimeHelper::H_call_ctpop_i64,
                       T, 1);
    Call->addArg(Val);
    lowerCall(Call);
    // The popcount helpers always return 32-bit values, while the intrinsic's
    // signature matches the native POPCNT instruction and fills a 64-bit reg
    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
    // the user doesn't do that in the IR. If the user does that in the IR,
    // then this zeroing instruction is dead and gets optimized out.
    if (!Traits::Is64Bit) {
      assert(T == Dest);
      if (Val->getType() == IceType_i64) {
        auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
        Constant *Zero = Ctx->getConstantZero(IceType_i32);
        _mov(DestHi, Zero);
      }
    } else {
      assert(Val->getType() == IceType_i64);
      // T is 64 bit. It needs to be copied to dest. We need to:
      //
      // T_1.32 = trunc T.64 to i32
      // T_2.64 = zext T_1.32 to i64
      // Dest.<<right_size>> = T_2.<<right_size>>
      //
      // which ensures the upper 32 bits will always be cleared. Just doing a
      //
      // mov Dest.32 = trunc T.32 to i32
      //
      // is dangerous because there's a chance the compiler will optimize this
      // copy out. To use _movzx we need two new registers (one 32-, and
      // another 64-bit wide.)
      Variable *T_1 = makeReg(IceType_i32);
      _mov(T_1, T);
      Variable *T_2 = makeReg(IceType_i64);
      _movzx(T_2, T_1);
      _mov(Dest, T_2);
    }
    return;
  }
  case Intrinsics::Ctlz: {
    // The "is zero undef" parameter is ignored and we always return a
    // well-defined value.
    Operand *Val = legalize(Instr->getArg(0));
    Operand *FirstVal;
    Operand *SecondVal = nullptr;
    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
      FirstVal = loOperand(Val);
      SecondVal = hiOperand(Val);
    } else {
      FirstVal = Val;
    }
    constexpr bool IsCttz = false;
    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
                    SecondVal);
    return;
  }
  case Intrinsics::Cttz: {
    // The "is zero undef" parameter is ignored and we always return a
    // well-defined value.
    Operand *Val = legalize(Instr->getArg(0));
    Operand *FirstVal;
    Operand *SecondVal = nullptr;
    if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
      FirstVal = hiOperand(Val);
      SecondVal = loOperand(Val);
    } else {
      FirstVal = Val;
    }
    constexpr bool IsCttz = true;
    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
                    SecondVal);
    return;
  }
  case Intrinsics::Fabs: {
    Operand *Src = legalize(Instr->getArg(0));
    Type Ty = Src->getType();
    Variable *Dest = Instr->getDest();
    Variable *T = makeVectorOfFabsMask(Ty);
    // The pand instruction operates on an m128 memory operand, so if Src is an
    // f32 or f64, we need to make sure it's in a register.
    if (isVectorType(Ty)) {
      if (llvm::isa<X86OperandMem>(Src))
        Src = legalizeToReg(Src);
    } else {
      Src = legalizeToReg(Src);
    }
    _pand(T, Src);
    if (isVectorType(Ty))
      _movp(Dest, T);
    else
      _mov(Dest, T);
    return;
  }
  case Intrinsics::Longjmp: {
    InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
    Call->addArg(Instr->getArg(0));
    Call->addArg(Instr->getArg(1));
    lowerCall(Call);
    return;
  }
  case Intrinsics::Memcpy: {
    lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
    return;
  }
  case Intrinsics::Memmove: {
    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
    return;
  }
  case Intrinsics::Memset: {
    lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
    return;
  }
  case Intrinsics::NaClReadTP: {
    if (NeedSandboxing) {
      Operand *Src =
          dispatchToConcrete(&ConcreteTarget::createNaClReadTPSrcOperand);
      Variable *Dest = Instr->getDest();
      Variable *T = nullptr;
      _mov(T, Src);
      _mov(Dest, T);
    } else {
      InstCall *Call =
          makeHelperCall(RuntimeHelper::H_call_read_tp, Instr->getDest(), 0);
      lowerCall(Call);
    }
    return;
  }
  case Intrinsics::Setjmp: {
    InstCall *Call =
        makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
    Call->addArg(Instr->getArg(0));
    lowerCall(Call);
    return;
  }
  case Intrinsics::Sqrt: {
    assert(isScalarFloatingType(Instr->getDest()->getType()) ||
           getFlags().getApplicationBinaryInterface() != ::Ice::ABI_PNaCl);
    Operand *Src = legalize(Instr->getArg(0));
    Variable *Dest = Instr->getDest();
    Variable *T = makeReg(Dest->getType());
    _sqrt(T, Src);
    if (isVectorType(Dest->getType())) {
      _movp(Dest, T);
    } else {
      _mov(Dest, T);
    }
    return;
  }
  case Intrinsics::Stacksave: {
    if (!Traits::Is64Bit || !NeedSandboxing) {
      Variable *esp = Func->getTarget()->getPhysicalRegister(getStackReg(),
                                                             Traits::WordType);
      Variable *Dest = Instr->getDest();
      _mov(Dest, esp);
      return;
    }
    Variable *esp = Func->getTarget()->getPhysicalRegister(
        Traits::RegisterSet::Reg_esp, IceType_i32);
    Variable *Dest = Instr->getDest();
    _mov(Dest, esp);

    return;
  }
  case Intrinsics::Stackrestore: {
    Operand *Src = Instr->getArg(0);
    _mov_sp(Src);
    return;
  }

  case Intrinsics::Trap:
    _ud2();
    return;
  case Intrinsics::LoadSubVector: {
    assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
           "LoadSubVector second argument must be a constant");
    Variable *Dest = Instr->getDest();
    Type Ty = Dest->getType();
    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
    Operand *Addr = Instr->getArg(0);
    X86OperandMem *Src = formMemoryOperand(Addr, Ty);
    doMockBoundsCheck(Src);

    if (Dest->isRematerializable()) {
      Context.insert<InstFakeDef>(Dest);
      return;
    }

    auto *T = makeReg(Ty);
    switch (SubVectorSize->getValue()) {
    case 4:
      _movd(T, Src);
      break;
    case 8:
      _movq(T, Src);
      break;
    default:
      Func->setError("Unexpected size for LoadSubVector");
      return;
    }
    _movp(Dest, T);
    return;
  }
  case Intrinsics::StoreSubVector: {
    assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
           "StoreSubVector third argument must be a constant");
    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
    Operand *Value = Instr->getArg(0);
    Operand *Addr = Instr->getArg(1);
    X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
    doMockBoundsCheck(NewAddr);

    Value = legalizeToReg(Value);

    switch (SubVectorSize->getValue()) {
    case 4:
      _stored(Value, NewAddr);
      break;
    case 8:
      _storeq(Value, NewAddr);
      break;
    default:
      Func->setError("Unexpected size for StoreSubVector");
      return;
    }
    return;
  }
  case Intrinsics::VectorPackSigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Src0->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _packss(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::VectorPackUnsigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Src0->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _packus(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::SignMask: {
    Operand *SrcReg = legalizeToReg(Instr->getArg(0));
    Variable *Dest = Instr->getDest();
    Variable *T = makeReg(IceType_i32);
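    // movmskps/pmovmskb copy the most significant bit of each vector lane
    // into the low bits of a GPR; e.g., for v4f32 the result is a 4-bit mask.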
    if (SrcReg->getType() == IceType_v4f32 ||
        SrcReg->getType() == IceType_v4i32 ||
        SrcReg->getType() == IceType_v16i8) {
      _movmsk(T, SrcReg);
    } else {
      // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
      llvm::report_fatal_error("Invalid type for SignMask intrinsic");
    }
    _mov(Dest, T);
    return;
  }
  case Intrinsics::MultiplyHighSigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _pmulhw(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::MultiplyHighUnsigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _pmulhuw(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::MultiplyAddPairs: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _pmaddwd(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::AddSaturateSigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _padds(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::SubtractSaturateSigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _psubs(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::AddSaturateUnsigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _paddus(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::SubtractSaturateUnsigned: {
    Operand *Src0 = Instr->getArg(0);
    Operand *Src1 = Instr->getArg(1);
    Variable *Dest = Instr->getDest();
    auto *T = makeReg(Dest->getType());
    auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    _movp(T, Src0RM);
    _psubus(T, Src1RM);
    _movp(Dest, T);
    return;
  }
  case Intrinsics::Nearbyint: {
    Operand *Src = Instr->getArg(0);
    Variable *Dest = Instr->getDest();
    Type DestTy = Dest->getType();
    if (isVectorType(DestTy)) {
      assert(DestTy == IceType_v4i32);
      assert(Src->getType() == IceType_v4f32);
      Operand *Src0R = legalizeToReg(Src);
      Variable *T = makeReg(DestTy);
      _cvt(T, Src0R, Traits::Insts::Cvt::Ps2dq);
      _movp(Dest, T);
    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
      llvm::report_fatal_error("Helper call was expected");
    } else {
      Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
      Variable *T_1 = nullptr;
      if (Traits::Is64Bit && DestTy == IceType_i64) {
        T_1 = makeReg(IceType_i64);
      } else {
        assert(DestTy != IceType_i64);
        T_1 = makeReg(IceType_i32);
      }
      // cvt() requires its integer argument to be a GPR.
      Variable *T_2 = makeReg(DestTy);
      if (isByteSizedType(DestTy)) {
        assert(T_1->getType() == IceType_i32);
        T_1->setRegClass(RCX86_Is32To8);
        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
      }
      _cvt(T_1, Src0RM, Traits::Insts::Cvt::Ss2si);
      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
      if (DestTy == IceType_i1)
        _and(T_2, Ctx->getConstantInt1(1));
      _mov(Dest, T_2);
    }
    return;
  }
  case Intrinsics::Round: {
    assert(InstructionSet >= Traits::SSE4_1);
    Variable *Dest = Instr->getDest();
    Operand *Src = Instr->getArg(0);
    Operand *Mode = Instr->getArg(1);
    assert(llvm::isa<ConstantInteger32>(Mode) &&
           "Round last argument must be a constant");
    auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
    int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
    (void)Imm;
    assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
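    // For SSE4.1 roundss/roundps, immediate bits 1:0 select the rounding
    // mode: 0 = nearest, 1 = floor, 2 = ceil, 3 = truncate.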
    auto *T = makeReg(Dest->getType());
    _round(T, SrcRM, Mode);
    _movp(Dest, T);
    return;
  }
  default: // UnknownIntrinsic
    Func->setError("Unexpected intrinsic");
    return;
  }
  return;
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerAtomicCmpxchg(Variable *DestPrev,
                                                   Operand *Ptr,
                                                   Operand *Expected,
                                                   Operand *Desired) {
  Type Ty = Expected->getType();
  if (!Traits::Is64Bit && Ty == IceType_i64) {
    // Reserve the pre-colored registers first, before adding any more
    // infinite-weight variables from formMemoryOperand's legalization.
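    // cmpxchg8b compares edx:eax against the 64-bit memory operand; on a
    // match it stores ecx:ebx there, otherwise it loads the memory operand
    // into edx:eax. Either way edx:eax ends up holding the previous value,
    // which is exactly what DestPrev needs.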
    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
    _mov(T_eax, loOperand(Expected));
    _mov(T_edx, hiOperand(Expected));
    _mov(T_ebx, loOperand(Desired));
    _mov(T_ecx, hiOperand(Desired));
    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
    constexpr bool Locked = true;
    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
    auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
    auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
    _mov(DestLo, T_eax);
    _mov(DestHi, T_edx);
    return;
  }
  RegNumT Eax;
  switch (Ty) {
  default:
    llvm::report_fatal_error("Bad type for cmpxchg");
  case IceType_i64:
    Eax = Traits::getRaxOrDie();
    break;
  case IceType_i32:
    Eax = Traits::RegisterSet::Reg_eax;
    break;
  case IceType_i16:
    Eax = Traits::RegisterSet::Reg_ax;
    break;
  case IceType_i8:
    Eax = Traits::RegisterSet::Reg_al;
    break;
  }
  Variable *T_eax = makeReg(Ty, Eax);
  _mov(T_eax, Expected);
  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
  Variable *DesiredReg = legalizeToReg(Desired);
  constexpr bool Locked = true;
  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
  _mov(DestPrev, T_eax);
}

template <typename TraitsType>
bool TargetX86Base<TraitsType>::tryOptimizedCmpxchgCmpBr(Variable *Dest,
                                                         Operand *PtrToMem,
                                                         Operand *Expected,
                                                         Operand *Desired) {
  if (Func->getOptLevel() == Opt_m1)
    return false;
  // Peek ahead a few instructions and see how Dest is used.
  // It's very common to have:
  //
  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
  // [%y_phi = ...] // list of phi stores
  // %p = icmp eq i32 %x, %expected
  // br i1 %p, label %l1, label %l2
  //
  // which we can optimize into:
  //
  // %x = <cmpxchg code>
  // [%y_phi = ...] // list of phi stores
  // br eq, %l1, %l2
  InstList::iterator I = Context.getCur();
  // I is currently the InstIntrinsicCall. Peek past that.
  // This assumes that the atomic cmpxchg has not been lowered yet,
  // so that the instructions seen in the scan from "Cur" are simple.
  assert(llvm::isa<InstIntrinsicCall>(*I));
  Inst *NextInst = Context.getNextInst(I);
  if (!NextInst)
    return false;
  // There might be phi assignments right before the compare+branch, since this
  // could be a backward branch for a loop. This placement of assignments is
  // determined by placePhiStores().
  CfgVector<InstAssign *> PhiAssigns;
  while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
    if (PhiAssign->getDest() == Dest)
      return false;
    PhiAssigns.push_back(PhiAssign);
    NextInst = Context.getNextInst(I);
    if (!NextInst)
      return false;
  }
  if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
      return false;
    }
    NextInst = Context.getNextInst(I);
    if (!NextInst)
      return false;
    if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
      if (!NextBr->isUnconditional() &&
          NextCmp->getDest() == NextBr->getCondition() &&
          NextBr->isLastUse(NextCmp->getDest())) {
        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
          // Lower the phi assignments now, before the branch (same placement
          // as before).
          InstAssign *PhiAssign = PhiAssigns[i];
          PhiAssign->setDeleted();
          lowerAssign(PhiAssign);
          Context.advanceNext();
        }
        _br(Traits::Cond::Br_e, NextBr->getTargetTrue(),
            NextBr->getTargetFalse());
        // Skip over the old compare and branch, by deleting them.
        NextCmp->setDeleted();
        NextBr->setDeleted();
        Context.advanceNext();
        Context.advanceNext();
        return true;
      }
    }
  }
  return false;
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerAtomicRMW(Variable *Dest,
                                               uint32_t Operation, Operand *Ptr,
                                               Operand *Val) {
  bool NeedsCmpxchg = false;
  LowerBinOp Op_Lo = nullptr;
  LowerBinOp Op_Hi = nullptr;
  switch (Operation) {
  default:
    Func->setError("Unknown AtomicRMW operation");
    return;
  case Intrinsics::AtomicAdd: {
    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
      // All the fall-through paths must set this to true; it is only used
      // for asserting.
      NeedsCmpxchg = true;
      Op_Lo = &TargetX86Base<TraitsType>::_add;
      Op_Hi = &TargetX86Base<TraitsType>::_adc;
      break;
    }
    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
    constexpr bool Locked = true;
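    // lock xadd writes old+Val back to memory and leaves the old value in T,
    // so the move below gives Dest the fetch-and-add result.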
    Variable *T = nullptr;
    _mov(T, Val);
    _xadd(Addr, T, Locked);
    _mov(Dest, T);
    return;
  }
  case Intrinsics::AtomicSub: {
    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
      NeedsCmpxchg = true;
      Op_Lo = &TargetX86Base<TraitsType>::_sub;
      Op_Hi = &TargetX86Base<TraitsType>::_sbb;
      break;
    }
    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
    constexpr bool Locked = true;
    Variable *T = nullptr;
    _mov(T, Val);
    _neg(T);
    _xadd(Addr, T, Locked);
    _mov(Dest, T);
    return;
  }
  case Intrinsics::AtomicOr:
    // TODO(jvoung): If Dest is null or dead, then some of these
    // operations do not need an "exchange", but just a locked op.
    // That appears to be "worth" it for sub, or, and, and xor.
    // xadd is probably fine vs lock add for add, and xchg is fine
    // vs an atomic store.
    NeedsCmpxchg = true;
    Op_Lo = &TargetX86Base<TraitsType>::_or;
    Op_Hi = &TargetX86Base<TraitsType>::_or;
    break;
  case Intrinsics::AtomicAnd:
    NeedsCmpxchg = true;
    Op_Lo = &TargetX86Base<TraitsType>::_and;
    Op_Hi = &TargetX86Base<TraitsType>::_and;
    break;
  case Intrinsics::AtomicXor:
    NeedsCmpxchg = true;
    Op_Lo = &TargetX86Base<TraitsType>::_xor;
    Op_Hi = &TargetX86Base<TraitsType>::_xor;
    break;
  case Intrinsics::AtomicExchange:
    if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
      NeedsCmpxchg = true;
      // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
      // just need to be moved to the ecx and ebx registers.
      Op_Lo = nullptr;
      Op_Hi = nullptr;
      break;
    }
    X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
    Variable *T = nullptr;
    _mov(T, Val);
    _xchg(Addr, T);
    _mov(Dest, T);
    return;
  }
  // Otherwise, we need a cmpxchg loop.
  (void)NeedsCmpxchg;
  assert(NeedsCmpxchg);
  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
                                                         LowerBinOp Op_Hi,
                                                         Variable *Dest,
                                                         Operand *Ptr,
                                                         Operand *Val) {
  // Expand a more complex RMW operation as a cmpxchg loop:
  // For 64-bit:
  //   mov     eax, [ptr]
  //   mov     edx, [ptr + 4]
  // .LABEL:
  //   mov     ebx, eax
  //   <Op_Lo> ebx, <desired_adj_lo>
  //   mov     ecx, edx
  //   <Op_Hi> ecx, <desired_adj_hi>
  //   lock cmpxchg8b [ptr]
  //   jne     .LABEL
  //   mov     <dest_lo>, eax
  //   mov     <dest_hi>, edx
  //
  // For 32-bit:
  //   mov     eax, [ptr]
  // .LABEL:
  //   mov     <reg>, eax
  //   op      <reg>, [desired_adj]
  //   lock cmpxchg [ptr], <reg>
  //   jne     .LABEL
  //   mov     <dest>, eax
  //
  // If Op_{Lo,Hi} are nullptr, then just copy the value.
  Val = legalize(Val);
  Type Ty = Val->getType();
  if (!Traits::Is64Bit && Ty == IceType_i64) {
    Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
    Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
    X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
    _mov(T_eax, loOperand(Addr));
    _mov(T_edx, hiOperand(Addr));
    Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
    Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
    InstX86Label *Label = InstX86Label::create(Func, this);
    const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
    if (!IsXchg8b) {
      Context.insert(Label);
      _mov(T_ebx, T_eax);
      (this->*Op_Lo)(T_ebx, loOperand(Val));
      _mov(T_ecx, T_edx);
      (this->*Op_Hi)(T_ecx, hiOperand(Val));
    } else {
      // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
      // It just needs the Val loaded into ebx and ecx.
      // That can also be done before the loop.
      _mov(T_ebx, loOperand(Val));
      _mov(T_ecx, hiOperand(Val));
      Context.insert(Label);
    }
    constexpr bool Locked = true;
    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
    _br(Traits::Cond::Br_ne, Label);
    if (!IsXchg8b) {
      // If Val is a variable, model the extended live range of Val through
      // the end of the loop, since it will be re-used by the loop.
      if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
        auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
        auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
        Context.insert<InstFakeUse>(ValLo);
        Context.insert<InstFakeUse>(ValHi);
      }
    } else {
      // For xchg, the loop is slightly smaller and ebx/ecx are used.
      Context.insert<InstFakeUse>(T_ebx);
      Context.insert<InstFakeUse>(T_ecx);
    }
    // The address base (if any) is also reused in the loop.
    if (Variable *Base = Addr->getBase())
      Context.insert<InstFakeUse>(Base);
    auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
    auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
    _mov(DestLo, T_eax);
    _mov(DestHi, T_edx);
    return;
  }
  X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
  RegNumT Eax;
  switch (Ty) {
  default:
    llvm::report_fatal_error("Bad type for atomicRMW");
  case IceType_i64:
    Eax = Traits::getRaxOrDie();
    break;
  case IceType_i32:
    Eax = Traits::RegisterSet::Reg_eax;
    break;
  case IceType_i16:
    Eax = Traits::RegisterSet::Reg_ax;
    break;
  case IceType_i8:
    Eax = Traits::RegisterSet::Reg_al;
    break;
  }
  Variable *T_eax = makeReg(Ty, Eax);
  _mov(T_eax, Addr);
  auto *Label = Context.insert<InstX86Label>(this);
  // We want to pick a different register for T than Eax, so don't use
  // _mov(T == nullptr, T_eax).
  Variable *T = makeReg(Ty);
  _mov(T, T_eax);
  (this->*Op_Lo)(T, Val);
  constexpr bool Locked = true;
  _cmpxchg(Addr, T_eax, T, Locked);
  _br(Traits::Cond::Br_ne, Label);
  // If Val is a variable, model the extended live range of Val through
  // the end of the loop, since it will be re-used by the loop.
  if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
    Context.insert<InstFakeUse>(ValVar);
  }
  // The address base (if any) is also reused in the loop.
  if (Variable *Base = Addr->getBase())
    Context.insert<InstFakeUse>(Base);
  _mov(Dest, T_eax);
}

/// Lowers count {trailing, leading} zeros intrinsic.
///
/// We could do constant folding here, but that should have
/// been done by the front-end/middle-end optimizations.
template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerCountZeros(bool Cttz, Type Ty,
                                                Variable *Dest,
                                                Operand *FirstVal,
                                                Operand *SecondVal) {
  // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
  // Then the instructions will handle the Val == 0 case much more simply
  // and won't require conversion from bit position to number of zeros.
  //
  // Otherwise:
  //   bsr IF_NOT_ZERO, Val
  //   mov T_DEST, ((Ty == i32) ? 63 : 127)
  //   cmovne T_DEST, IF_NOT_ZERO
  //   xor T_DEST, ((Ty == i32) ? 31 : 63)
  //   mov DEST, T_DEST
  //
  // NOTE: T_DEST must be a register because cmov requires its dest to be a
  // register. Also, bsf and bsr require their dest to be a register.
  //
  // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
  // E.g., for 000... 00001100, bsr will say that the most significant bit
  // set is at position 3, while the number of leading zeros is 28. Xor is
  // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
  // all-zeros case).
  //
  // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32
  // bits are all zero, and compute the result for that case (checking the
  // lower 32 bits). Then actually compute the result for the upper bits and
  // cmov in the result from the lower computation if the earlier speculation
  // was correct.
  //
  // Cttz is similar, but uses bsf instead, doesn't require the xor bit
  // position conversion, and reverses the speculation.
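  //
  // Worked example (a sketch; ctlz of the 64-bit value 0x0000000C on x86-32):
  // bsr on the low word finds bit 3, so the speculative result is
  // (31 ^ 3) + 32 == 60; the high word is zero, so the cmov keeps 60, the
  // correct number of leading zeros.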

  // TODO(jpp): refactor this method.
  assert(Ty == IceType_i32 || Ty == IceType_i64);
  const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
  Variable *T = makeReg(DestTy);
  Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
  if (Cttz) {
    _bsf(T, FirstValRM);
  } else {
    _bsr(T, FirstValRM);
  }
  Variable *T_Dest = makeReg(DestTy);
  Constant *_31 = Ctx->getConstantInt32(31);
  Constant *_32 = Ctx->getConstantInt(DestTy, 32);
  Constant *_63 = Ctx->getConstantInt(DestTy, 63);
  Constant *_64 = Ctx->getConstantInt(DestTy, 64);
  if (Cttz) {
    if (DestTy == IceType_i64) {
      _mov(T_Dest, _64);
    } else {
      _mov(T_Dest, _32);
    }
  } else {
    Constant *_127 = Ctx->getConstantInt(DestTy, 127);
    if (DestTy == IceType_i64) {
      _mov(T_Dest, _127);
    } else {
      _mov(T_Dest, _63);
    }
  }
  _cmov(T_Dest, T, Traits::Cond::Br_ne);
  if (!Cttz) {
    if (DestTy == IceType_i64) {
      // Even though there's a _63 available at this point, that constant might
      // not be an i32, which will cause the xor emission to fail.
      Constant *_63 = Ctx->getConstantInt32(63);
      _xor(T_Dest, _63);
    } else {
      _xor(T_Dest, _31);
    }
  }
  if (Traits::Is64Bit || Ty == IceType_i32) {
    _mov(Dest, T_Dest);
    return;
  }
  _add(T_Dest, _32);
  auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
  auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
  // Will be using "test" on this, so we need a registerized variable.
  Variable *SecondVar = legalizeToReg(SecondVal);
  Variable *T_Dest2 = makeReg(IceType_i32);
  if (Cttz) {
    _bsf(T_Dest2, SecondVar);
  } else {
    _bsr(T_Dest2, SecondVar);
    _xor(T_Dest2, _31);
  }
  _test(SecondVar, SecondVar);
  _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
  _mov(DestLo, T_Dest2);
  _mov(DestHi, Ctx->getConstantZero(IceType_i32));
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest,
                                          Variable *Base, Constant *Offset) {
  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
  // legalize Mem properly.
  if (Offset)
    assert(!llvm::isa<ConstantRelocatable>(Offset));

  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);

  if (isVectorType(Ty))
    _movp(Dest, Mem);
  else if (Ty == IceType_f64)
    _movq(Dest, Mem);
  else
    _mov(Dest, Mem);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value,
                                           Variable *Base, Constant *Offset) {
  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
  // legalize Mem properly.
  if (Offset)
    assert(!llvm::isa<ConstantRelocatable>(Offset));

  auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);

  if (isVectorType(Ty))
    _storep(Value, Mem);
  else if (Ty == IceType_f64)
    _storeq(Value, Mem);
  else
    _store(Value, Mem);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::copyMemory(Type Ty, Variable *Dest,
                                           Variable *Src, int32_t OffsetAmt) {
  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
  // TODO(ascull): this or add nullptr test to _movp, _movq
  Variable *Data = makeReg(Ty);

  typedLoad(Ty, Data, Src, Offset);
  typedStore(Ty, Data, Dest, Offset);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerMemcpy(Operand *Dest, Operand *Src,
                                            Operand *Count) {
  // There is a load and store for each chunk in the unroll
  constexpr uint32_t BytesPerStorep = 16;

  // Check if the operands are constants
  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
  const bool IsCountConst = CountConst != nullptr;
  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;

  if (shouldOptimizeMemIntrins() && IsCountConst &&
      CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
    // Unlikely, but nothing to do if it does happen
    if (CountValue == 0)
      return;

    Variable *SrcBase = legalizeToReg(Src);
    Variable *DestBase = legalizeToReg(Dest);

    // Find the largest type that can be used and use it as much as possible in
    // reverse order. Then handle any remainder with overlapping copies. Since
    // the remainder will be at the end, there will be reduced pressure on the
    // memory unit as the accesses to the same memory are far apart.
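    // For example, a 21-byte memcpy (assuming largestTypeInSize picks the
    // widest power-of-two-sized chunk, here 16 bytes) becomes one 16-byte
    // copy at offset 0 plus one 8-byte copy at offset 13; bytes 13..15 are
    // harmlessly written twice.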
    Type Ty = largestTypeInSize(CountValue);
    uint32_t TyWidth = typeWidthInBytes(Ty);

    uint32_t RemainingBytes = CountValue;
    int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
    while (RemainingBytes >= TyWidth) {
      copyMemory(Ty, DestBase, SrcBase, Offset);
      RemainingBytes -= TyWidth;
      Offset -= TyWidth;
    }

    if (RemainingBytes == 0)
      return;

    // Lower the remaining bytes. Adjust to larger types in order to make use
    // of overlaps in the copies.
    Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
    Offset = CountValue - typeWidthInBytes(LeftOverTy);
    copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
    return;
  }

  // Fall back on a function call
  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
  Call->addArg(Dest);
  Call->addArg(Src);
  Call->addArg(Count);
  lowerCall(Call);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerMemmove(Operand *Dest, Operand *Src,
                                             Operand *Count) {
  // There is a load and store for each chunk in the unroll
  constexpr uint32_t BytesPerStorep = 16;

  // Check if the operands are constants
  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
  const bool IsCountConst = CountConst != nullptr;
  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;

  if (shouldOptimizeMemIntrins() && IsCountConst &&
      CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
    // Unlikely, but nothing to do if it does happen
    if (CountValue == 0)
      return;

    Variable *SrcBase = legalizeToReg(Src);
    Variable *DestBase = legalizeToReg(Dest);

    std::tuple<Type, Constant *, Variable *>
        Moves[Traits::MEMMOVE_UNROLL_LIMIT];
    Constant *Offset;
    Variable *Reg;

    // Copy the data into registers first, since the source and destination
    // could overlap and we must not clobber memory that has yet to be read.
    // This also means overlapping moves can be used, as we are taking a safe
    // snapshot of the memory.
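    // Because every load below is emitted before any store, an overlapping
    // src/dest (e.g., moving 24 bytes from p to p+8) cannot clobber source
    // bytes that have not been read yet.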
    Type Ty = largestTypeInSize(CountValue);
    uint32_t TyWidth = typeWidthInBytes(Ty);

    uint32_t RemainingBytes = CountValue;
    int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
    size_t N = 0;
    while (RemainingBytes >= TyWidth) {
      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
      Offset = Ctx->getConstantInt32(OffsetAmt);
      Reg = makeReg(Ty);
      typedLoad(Ty, Reg, SrcBase, Offset);
      RemainingBytes -= TyWidth;
      OffsetAmt -= TyWidth;
      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
    }

    if (RemainingBytes != 0) {
      // Lower the remaining bytes. Adjust to larger types in order to make use
      // of overlaps in the copies.
      assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
      Ty = firstTypeThatFitsSize(RemainingBytes);
      Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
      Reg = makeReg(Ty);
      typedLoad(Ty, Reg, SrcBase, Offset);
      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
    }

    // Copy the data out into the destination memory
    for (size_t i = 0; i < N; ++i) {
      std::tie(Ty, Offset, Reg) = Moves[i];
      typedStore(Ty, Reg, DestBase, Offset);
    }

    return;
  }

  // Fall back on a function call
  InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
  Call->addArg(Dest);
  Call->addArg(Src);
  Call->addArg(Count);
  lowerCall(Call);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerMemset(Operand *Dest, Operand *Val,
                                            Operand *Count) {
  constexpr uint32_t BytesPerStorep = 16;
  constexpr uint32_t BytesPerStoreq = 8;
  constexpr uint32_t BytesPerStorei32 = 4;
  assert(Val->getType() == IceType_i8);

  // Check if the operands are constants
  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
  const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
  const bool IsCountConst = CountConst != nullptr;
  const bool IsValConst = ValConst != nullptr;
  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
  const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;

  // Unlikely, but nothing to do if it does happen
  if (IsCountConst && CountValue == 0)
    return;

  // TODO(ascull): if the count is constant but val is not, it would be
  // possible to inline by spreading the value across 4 bytes and accessing
  // subregs, e.g. eax, ax and al.
  if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
    Variable *Base = nullptr;
    Variable *VecReg = nullptr;
    const uint32_t MaskValue = (ValValue & 0xff);
    const uint32_t SpreadValue =
        (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
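    // E.g., ValValue == 0xAB gives SpreadValue == 0xABABABAB, so a single
    // 32-bit store writes four copies of the byte.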

    auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
                                                        uint32_t OffsetAmt) {
      assert(Base != nullptr);
      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;

      // TODO(ascull): is 64-bit better with vector or scalar movq?
      auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
      if (isVectorType(Ty)) {
        assert(VecReg != nullptr);
        _storep(VecReg, Mem);
      } else if (Ty == IceType_f64) {
        assert(VecReg != nullptr);
        _storeq(VecReg, Mem);
      } else {
        assert(Ty != IceType_i64);
        _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
      }
    };

    // Find the largest type that can be used and use it as much as possible in
    // reverse order. Then handle any remainder with overlapping copies. Since
    // the remainder will be at the end, there will be reduced pressure on the
    // memory unit as the accesses to the same memory are far apart.
5379     Type Ty = IceType_void;
5380     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
5381         CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
5382       // When the value is zero it can be loaded into a vector register cheaply
5383       // using the xor trick.
5384       Base = legalizeToReg(Dest);
5385       VecReg = makeVectorOfZeros(IceType_v16i8);
5386       Ty = largestTypeInSize(CountValue);
5387     } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) {
5388       // When the value is non-zero or the count is small we can't use vector
5389       // instructions so are limited to 32-bit stores.
5390       Base = legalizeToReg(Dest);
5391       constexpr uint32_t MaxSize = 4;
5392       Ty = largestTypeInSize(CountValue, MaxSize);
5393     }
5394 
5395     if (Base) {
5396       uint32_t TyWidth = typeWidthInBytes(Ty);
5397 
5398       uint32_t RemainingBytes = CountValue;
5399       uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
5400       while (RemainingBytes >= TyWidth) {
5401         lowerSet(Ty, Offset);
5402         RemainingBytes -= TyWidth;
5403         Offset -= TyWidth;
5404       }
5405 
5406       if (RemainingBytes == 0)
5407         return;
5408 
5409       // Lower the remaining bytes. Adjust to larger types in order to make use
5410       // of overlaps in the copies.
5411       Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
5412       Offset = CountValue - typeWidthInBytes(LeftOverTy);
5413       lowerSet(LeftOverTy, Offset);
5414       return;
5415     }
5416   }
5417 
5418   // Fall back on calling the memset function. The value operand needs to be
5419   // extended to a stack slot size because the PNaCl ABI requires arguments to
5420   // be at least 32 bits wide.
5421   Operand *ValExt;
5422   if (IsValConst) {
5423     ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
5424   } else {
5425     Variable *ValExtVar = Func->makeVariable(stackSlotType());
5426     lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
5427     ValExt = ValExtVar;
5428   }
5429   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
5430   Call->addArg(Dest);
5431   Call->addArg(ValExt);
5432   Call->addArg(Count);
5433   lowerCall(Call);
5434 }
5435 
5436 class AddressOptimizer {
5437   AddressOptimizer() = delete;
5438   AddressOptimizer(const AddressOptimizer &) = delete;
5439   AddressOptimizer &operator=(const AddressOptimizer &) = delete;
5440 
5441 public:
5442   explicit AddressOptimizer(const Cfg *Func)
5443       : Func(Func), VMetadata(Func->getVMetadata()) {}
5444 
5445   inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
5446                              int32_t Offset, const Variable *Base,
5447                              const Variable *Index, uint16_t Shift,
5448                              const Inst *Reason) const;
5449 
5450   inline const Inst *matchAssign(Variable **Var,
5451                                  ConstantRelocatable **Relocatable,
5452                                  int32_t *Offset);
5453 
5454   inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
5455                                             uint16_t *Shift);
5456 
5457   inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
5458 
5459   inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
5460                                             const uint16_t Shift,
5461                                             ConstantRelocatable **Relocatable,
5462                                             int32_t *Offset);
5463 
5464 private:
5465   const Cfg *const Func;
5466   const VariablesMetadata *const VMetadata;
5467 
5468   static bool isAdd(const Inst *Instr) {
5469     if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
5470       return (Arith->getOp() == InstArithmetic::Add);
5471     }
5472     return false;
5473   }
5474 };
5475 
5476 void AddressOptimizer::dumpAddressOpt(
5477     const ConstantRelocatable *const Relocatable, int32_t Offset,
5478     const Variable *Base, const Variable *Index, uint16_t Shift,
5479     const Inst *Reason) const {
5480   if (!BuildDefs::dump())
5481     return;
5482   if (!Func->isVerbose(IceV_AddrOpt))
5483     return;
5484   OstreamLocker L(Func->getContext());
5485   Ostream &Str = Func->getContext()->getStrDump();
5486   Str << "Instruction: ";
5487   Reason->dumpDecorated(Func);
5488   Str << "  results in Base=";
5489   if (Base)
5490     Base->dump(Func);
5491   else
5492     Str << "<null>";
5493   Str << ", Index=";
5494   if (Index)
5495     Index->dump(Func);
5496   else
5497     Str << "<null>";
5498   Str << ", Shift=" << Shift << ", Offset=" << Offset
5499       << ", Relocatable=" << Relocatable << "\n";
5500 }
5501 
5502 const Inst *AddressOptimizer::matchAssign(Variable **Var,
5503                                           ConstantRelocatable **Relocatable,
5504                                           int32_t *Offset) {
5505   // Var originates from Var=SrcVar ==> set Var:=SrcVar
5506   if (*Var == nullptr)
5507     return nullptr;
5508   if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
5509     assert(!VMetadata->isMultiDef(*Var));
5510     if (llvm::isa<InstAssign>(VarAssign)) {
5511       Operand *SrcOp = VarAssign->getSrc(0);
5512       assert(SrcOp);
5513       if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
5514         if (!VMetadata->isMultiDef(SrcVar) &&
5515             // TODO: ensure SrcVar stays single-BB
5516             true) {
5517           *Var = SrcVar;
5518           return VarAssign;
5519         }
5520       } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
5521         int32_t MoreOffset = Const->getValue();
5522         if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
5523           return nullptr;
5524         *Var = nullptr;
5525         *Offset += MoreOffset;
5526         return VarAssign;
5527       } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
5528         if (*Relocatable == nullptr) {
5529           // It is always safe to fold a relocatable through assignment -- the
5530           // assignment frees a slot in the address operand that can be used to
5531           // hold the Sandbox Pointer -- if any.
5532           *Var = nullptr;
5533           *Relocatable = AddReloc;
5534           return VarAssign;
5535         }
5536       }
5537     }
5538   }
5539   return nullptr;
5540 }
5541 
5542 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
5543                                                      Variable **Index,
5544                                                      uint16_t *Shift) {
5545   // Index==nullptr && Base is Base=Var1+Var2 ==>
5546   //   set Base=Var1, Index=Var2, Shift=0
5547   if (*Base == nullptr)
5548     return nullptr;
5549   if (*Index != nullptr)
5550     return nullptr;
5551   auto *BaseInst = VMetadata->getSingleDefinition(*Base);
5552   if (BaseInst == nullptr)
5553     return nullptr;
5554   assert(!VMetadata->isMultiDef(*Base));
5555   if (BaseInst->getSrcSize() < 2)
5556     return nullptr;
5557   if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
5558     if (VMetadata->isMultiDef(Var1))
5559       return nullptr;
5560     if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
5561       if (VMetadata->isMultiDef(Var2))
5562         return nullptr;
5563       if (isAdd(BaseInst) &&
5564           // TODO: ensure Var1 and Var2 stay single-BB
5565           true) {
5566         *Base = Var1;
5567         *Index = Var2;
5568         *Shift = 0; // should already have been 0
5569         return BaseInst;
5570       }
5571     }
5572   }
5573   return nullptr;
5574 }
5575 
5576 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
5577                                                 uint16_t *Shift) {
5578   // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
5579   //   Index=Var, Shift+=log2(Const)
5580   if (*Index == nullptr)
5581     return nullptr;
5582   auto *IndexInst = VMetadata->getSingleDefinition(*Index);
5583   if (IndexInst == nullptr)
5584     return nullptr;
5585   assert(!VMetadata->isMultiDef(*Index));
5586 
5587   // When using an unsigned 32-bit array index on x64, it gets zero-extended
5588   // before the shift & add. The explicit zero extension can be eliminated
5589   // because x86 32-bit operations automatically get zero-extended into the
5590   // corresponding 64-bit register.
5591   if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
5592     if (CastInst->getCastKind() == InstCast::Zext) {
5593       if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
5594         if (Var->getType() == IceType_i32 &&
5595             CastInst->getDest()->getType() == IceType_i64) {
5596           IndexInst = VMetadata->getSingleDefinition(Var);
5597         }
5598       }
5599     }
5600   }
5601 
5602   if (IndexInst->getSrcSize() < 2)
5603     return nullptr;
5604   if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
5605     if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
5606       if (auto *Const =
5607               llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
5608         if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
5609           return nullptr;
5610         switch (ArithInst->getOp()) {
5611         default:
5612           return nullptr;
5613         case InstArithmetic::Mul: {
5614           uint32_t Mult = Const->getValue();
5615           uint32_t LogMult;
5616           switch (Mult) {
5617           case 1:
5618             LogMult = 0;
5619             break;
5620           case 2:
5621             LogMult = 1;
5622             break;
5623           case 4:
5624             LogMult = 2;
5625             break;
5626           case 8:
5627             LogMult = 3;
5628             break;
5629           default:
5630             return nullptr;
5631           }
5632           if (*Shift + LogMult <= 3) {
5633             *Index = Var;
5634             *Shift += LogMult;
5635             return IndexInst;
5636           }
          // No match within the shift budget; avoid the implicit fallthrough
          // into the Shl case.
          break;
        }
        case InstArithmetic::Shl: {
5639           uint32_t ShiftAmount = Const->getValue();
5640           switch (ShiftAmount) {
5641           case 0:
5642           case 1:
5643           case 2:
5644           case 3:
5645             break;
5646           default:
5647             return nullptr;
5648           }
5649           if (*Shift + ShiftAmount <= 3) {
5650             *Index = Var;
5651             *Shift += ShiftAmount;
5652             return IndexInst;
5653           }
5654         }
5655         }
5656       }
5657     }
5658   }
5659   return nullptr;
5660 }
5661 
5662 const Inst *AddressOptimizer::matchOffsetIndexOrBase(
5663     Variable **IndexOrBase, const uint16_t Shift,
5664     ConstantRelocatable **Relocatable, int32_t *Offset) {
5665   // Base is Base=Var+Const || Base is Base=Const+Var ==>
5666   //   set Base=Var, Offset+=Const
5667   // Base is Base=Var-Const ==>
5668   //   set Base=Var, Offset-=Const
5669   // Index is Index=Var+Const ==>
5670   //   set Index=Var, Offset+=(Const<<Shift)
5671   // Index is Index=Const+Var ==>
5672   //   set Index=Var, Offset+=(Const<<Shift)
5673   // Index is Index=Var-Const ==>
5674   //   set Index=Var, Offset-=(Const<<Shift)
5675   // Treat Index=Var Or Const as Index=Var + Const
5676   //    when Var = Var' << N and log2(Const) <= N
5677   // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
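  // Worked example (sketch): if v is defined as v = w << 4, the low 4 bits of
  // v are known to be zero, so `v | 5` equals `v + 5` (5 needs
  // log2(5)+1 = 3 bits <= 4 available zero bits) and the 5 can be folded into
  // Offset.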
5678 
5679   if (*IndexOrBase == nullptr) {
5680     return nullptr;
5681   }
5682   const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
5683   if (Definition == nullptr) {
5684     return nullptr;
5685   }
5686   assert(!VMetadata->isMultiDef(*IndexOrBase));
5687   if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
5688     switch (ArithInst->getOp()) {
5689     case InstArithmetic::Add:
5690     case InstArithmetic::Sub:
5691     case InstArithmetic::Or:
5692       break;
5693     default:
5694       return nullptr;
5695     }
5696 
5697     Operand *Src0 = ArithInst->getSrc(0);
5698     Operand *Src1 = ArithInst->getSrc(1);
5699     auto *Var0 = llvm::dyn_cast<Variable>(Src0);
5700     auto *Var1 = llvm::dyn_cast<Variable>(Src1);
5701     auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
5702     auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
5703     auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
5704     auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
5705 
5706     bool IsAdd = false;
5707     if (ArithInst->getOp() == InstArithmetic::Or) {
5708       Variable *Var = nullptr;
5709       ConstantInteger32 *Const = nullptr;
5710       if (Var0 && Const1) {
5711         Var = Var0;
5712         Const = Const1;
5713       } else if (Const0 && Var1) {
5714         Var = Var1;
5715         Const = Const0;
5716       } else {
5717         return nullptr;
5718       }
5719       auto *VarDef =
5720           llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
5721       if (VarDef == nullptr)
5722         return nullptr;
5723 
5724       SizeT ZeroesAvailable = 0;
5725       if (VarDef->getOp() == InstArithmetic::Shl) {
5726         if (auto *ConstInt =
5727                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5728           ZeroesAvailable = ConstInt->getValue();
5729         }
5730       } else if (VarDef->getOp() == InstArithmetic::Mul) {
5731         SizeT PowerOfTwo = 0;
5732         if (auto *MultConst =
5733                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
5734           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5735             PowerOfTwo += MultConst->getValue();
5736           }
5737         }
5738         if (auto *MultConst =
5739                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5740           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5741             PowerOfTwo += MultConst->getValue();
5742           }
5743         }
5744         ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
5745       }
5746       SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
5747       if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
5748         return nullptr;
5749       IsAdd = true; // treat it as an add if the above conditions hold
5750     } else {
5751       IsAdd = ArithInst->getOp() == InstArithmetic::Add;
5752     }
5753 
5754     Variable *NewIndexOrBase = nullptr;
5755     int32_t NewOffset = 0;
5756     ConstantRelocatable *NewRelocatable = *Relocatable;
5757     if (Var0 && Var1)
5758       // TODO(sehr): merge base/index splitting into here.
5759       return nullptr;
5760     if (!IsAdd && Var1)
5761       return nullptr;
5762     if (Var0)
5763       NewIndexOrBase = Var0;
5764     else if (Var1)
5765       NewIndexOrBase = Var1;
5766     // Don't know how to add/subtract two relocatables.
5767     if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
5768       return nullptr;
5769     // Don't know how to subtract a relocatable.
5770     if (!IsAdd && Reloc1)
5771       return nullptr;
5772     // Incorporate ConstantRelocatables.
5773     if (Reloc0)
5774       NewRelocatable = Reloc0;
5775     else if (Reloc1)
5776       NewRelocatable = Reloc1;
5777     // Compute the updated constant offset.
5778     if (Const0) {
5779       const int32_t MoreOffset =
5780           IsAdd ? Const0->getValue() : -Const0->getValue();
5781       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5782         return nullptr;
5783       NewOffset += MoreOffset;
5784     }
5785     if (Const1) {
5786       const int32_t MoreOffset =
5787           IsAdd ? Const1->getValue() : -Const1->getValue();
5788       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5789         return nullptr;
5790       NewOffset += MoreOffset;
5791     }
5792     if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
5793       return nullptr;
5794     *IndexOrBase = NewIndexOrBase;
    // Shift is always zero when this is called for the base register, so the
    // scaling below only applies to index matches.
    *Offset += (NewOffset << Shift);
    *Relocatable = NewRelocatable;
5798     return Definition;
5799   }
5800   return nullptr;
5801 }
5802 
template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::computeAddressOpt(const Inst *Instr, Type MemType,
5806                                              Operand *Addr) {
5807   Func->resetCurrentNode();
5808   if (Func->isVerbose(IceV_AddrOpt)) {
5809     OstreamLocker L(Func->getContext());
5810     Ostream &Str = Func->getContext()->getStrDump();
5811     Str << "\nStarting computeAddressOpt for instruction:\n  ";
5812     Instr->dumpDecorated(Func);
5813   }
5814 
5815   OptAddr NewAddr;
5816   NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
5817   if (NewAddr.Base == nullptr)
5818     return nullptr;
5819 
5820   // If the Base has more than one use or is live across multiple blocks, then
5821   // don't go further. Alternatively (?), never consider a transformation that
5822   // would change a variable that is currently *not* live across basic block
5823   // boundaries into one that *is*.
5824   if (!getFlags().getLoopInvariantCodeMotion()) {
    // Multi-block address optimization is needed when LICM is enabled, so we
    // can't bail out on multi-block variables here. It might make sense to
    // restrict this to the current node and the loop header.
5827     if (Func->getVMetadata()->isMultiBlock(
5828             NewAddr.Base) /* || Base->getUseCount() > 1*/)
5829       return nullptr;
5830   }
5831   AddressOptimizer AddrOpt(Func);
5832   const bool MockBounds = getFlags().getMockBoundsCheck();
5833   const Inst *Reason = nullptr;
5834   bool AddressWasOptimized = false;
5835   // The following unnamed struct identifies the address mode formation steps
5836   // that could potentially create an invalid memory operand (i.e., no free
5837   // slots for RebasePtr.) We add all those variables to this struct so that we
5838   // can use memset() to reset all members to false.
5839   struct {
5840     bool AssignBase = false;
5841     bool AssignIndex = false;
5842     bool OffsetFromBase = false;
5843     bool OffsetFromIndex = false;
5844     bool CombinedBaseIndex = false;
5845   } Skip;
5846   // This points to the boolean in Skip that represents the last folding
5847   // performed. This is used to disable a pattern match that generated an
5848   // invalid address. Without this, the algorithm would never finish.
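  // E.g. (sketch): if matchCombinedBaseIndex fills both Base and Index but
  // sandboxing later needs a free slot for the rebase pointer, that folding
  // is undone via NewAddrCheckpoint below and Skip.CombinedBaseIndex is set
  // so the same match is not retried forever.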
5849   bool *SkipLastFolding = nullptr;
  // NewAddrCheckpoint is used to roll back the address being formed if a
  // folding step produces an invalid address.
5852   OptAddr NewAddrCheckpoint;
5853   Reason = Instr;
5854   do {
5855     if (SandboxingType != ST_None) {
5856       // When sandboxing, we defer the sandboxing of NewAddr to the Concrete
5857       // Target. If our optimization was overly aggressive, then we simply undo
5858       // what the previous iteration did, and set the previous pattern's skip
5859       // bit to true.
5860       if (!legalizeOptAddrForSandbox(&NewAddr)) {
5861         *SkipLastFolding = true;
5862         SkipLastFolding = nullptr;
5863         NewAddr = NewAddrCheckpoint;
5864         Reason = nullptr;
5865       }
5866     }
5867 
5868     if (Reason) {
5869       AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
5870                              NewAddr.Index, NewAddr.Shift, Reason);
5871       AddressWasOptimized = true;
5872       Reason = nullptr;
5873       SkipLastFolding = nullptr;
      memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
5875     }
5876 
5877     NewAddrCheckpoint = NewAddr;
5878 
5879     // Update Base and Index to follow through assignments to definitions.
5880     if (!Skip.AssignBase &&
5881         (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
5882                                       &NewAddr.Offset))) {
5883       SkipLastFolding = &Skip.AssignBase;
5884       // Assignments of Base from a Relocatable or ConstantInt32 can result
5885       // in Base becoming nullptr.  To avoid code duplication in this loop we
5886       // prefer that Base be non-nullptr if possible.
5887       if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
5888           NewAddr.Shift == 0) {
5889         std::swap(NewAddr.Base, NewAddr.Index);
5890       }
5891       continue;
5892     }
    if (!Skip.AssignIndex &&
5894         (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
5895                                       &NewAddr.Offset))) {
5896       SkipLastFolding = &Skip.AssignIndex;
5897       continue;
5898     }
5899 
5900     if (!MockBounds) {
5901       // Transition from:
5902       //   <Relocatable + Offset>(Base) to
5903       //   <Relocatable + Offset>(Base, Index)
5904       if (!Skip.CombinedBaseIndex &&
5905           (Reason = AddrOpt.matchCombinedBaseIndex(
5906                &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
5907         SkipLastFolding = &Skip.CombinedBaseIndex;
5908         continue;
5909       }
5910 
5911       // Recognize multiply/shift and update Shift amount.
5912       // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
5913       //   Index=Var, Shift+=Const
5914       // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
5915       //   Index=Var, Shift+=log2(Const)
5916       if ((Reason =
5917                AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
5918         continue;
5919       }
5920 
5921       // If Shift is zero, the choice of Base and Index was purely arbitrary.
5922       // Recognize multiply/shift and set Shift amount.
5923       // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
5924       //   swap(Index,Base)
5925       // Similar for Base=Const*Var and Base=Var<<Const
5926       if (NewAddr.Shift == 0 &&
5927           (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
5928         std::swap(NewAddr.Base, NewAddr.Index);
5929         continue;
5930       }
5931     }
5932 
5933     // Update Offset to reflect additions/subtractions with constants and
5934     // relocatables.
5935     // TODO: consider overflow issues with respect to Offset.
5936     if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
5937                                      &NewAddr.Base, /*Shift =*/0,
5938                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
5939       SkipLastFolding = &Skip.OffsetFromBase;
5940       continue;
5941     }
5942     if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
5943                                       &NewAddr.Index, NewAddr.Shift,
5944                                       &NewAddr.Relocatable, &NewAddr.Offset))) {
5945       SkipLastFolding = &Skip.OffsetFromIndex;
5946       continue;
5947     }
5948 
5949     break;
5950   } while (Reason);
5951 
5952   if (!AddressWasOptimized) {
5953     return nullptr;
5954   }
5955 
5956   // Undo any addition of RebasePtr.  It will be added back when the mem
5957   // operand is sandboxed.
5958   if (NewAddr.Base == RebasePtr) {
5959     NewAddr.Base = nullptr;
5960   }
5961 
5962   if (NewAddr.Index == RebasePtr) {
5963     NewAddr.Index = nullptr;
5964     NewAddr.Shift = 0;
5965   }
5966 
5967   Constant *OffsetOp = nullptr;
5968   if (NewAddr.Relocatable == nullptr) {
5969     OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
5970   } else {
5971     OffsetOp =
5972         Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
5973                             NewAddr.Relocatable->getName());
5974   }
5975   // Vanilla ICE load instructions should not use the segment registers, and
5976   // computeAddressOpt only works at the level of Variables and Constants, not
5977   // other X86OperandMem, so there should be no mention of segment
5978   // registers there either.
5979   static constexpr auto SegmentReg =
5980       X86OperandMem::SegmentRegisters::DefaultSegment;
5981 
5982   return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
5983                                NewAddr.Index, NewAddr.Shift, SegmentReg);
5984 }
5985 
5986 /// Add a mock bounds check on the memory address before using it as a load or
5987 /// store operand.  The basic idea is that given a memory operand [reg], we
5988 /// would first add bounds-check code something like:
5989 ///
5990 ///   cmp reg, <lb>
5991 ///   jl out_of_line_error
5992 ///   cmp reg, <ub>
5993 ///   jg out_of_line_error
5994 ///
5995 /// In reality, the specific code will depend on how <lb> and <ub> are
5996 /// represented, e.g. an immediate, a global, or a function argument.
5997 ///
5998 /// As such, we need to enforce that the memory operand does not have the form
5999 /// [reg1+reg2], because then there is no simple cmp instruction that would
6000 /// suffice.  However, we consider [reg+offset] to be OK because the offset is
6001 /// usually small, and so <ub> could have a safety buffer built in and then we
6002 /// could instead branch to a custom out_of_line_error that does the precise
6003 /// check and jumps back if it turns out OK.
6004 ///
6005 /// For the purpose of mocking the bounds check, we'll do something like this:
6006 ///
6007 ///   cmp reg, 0
6008 ///   je label
6009 ///   cmp reg, 1
6010 ///   je label
6011 ///   label:
6012 ///
6013 /// Also note that we don't need to add a bounds check to a dereference of a
6014 /// simple global variable address.
6015 template <typename TraitsType>
6016 void TargetX86Base<TraitsType>::doMockBoundsCheck(Operand *Opnd) {
6017   if (!getFlags().getMockBoundsCheck())
6018     return;
6019   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
6020     if (Mem->getIndex()) {
6021       llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
6022     }
6023     Opnd = Mem->getBase();
6024   }
6025   // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
6026   // something else.  We only care if it is Variable.
6027   auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
6028   if (Var == nullptr)
6029     return;
6030   // We use lowerStore() to copy out-args onto the stack.  This creates a memory
6031   // operand with the stack pointer as the base register.  Don't do bounds
6032   // checks on that.
6033   if (Var->getRegNum() == getStackReg())
6034     return;
6035 
6036   auto *Label = InstX86Label::create(Func, this);
6037   _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
6038   _br(Traits::Cond::Br_e, Label);
6039   _cmp(Opnd, Ctx->getConstantInt32(1));
6040   _br(Traits::Cond::Br_e, Label);
6041   Context.insert(Label);
6042 }
6043 
6044 template <typename TraitsType>
6045 void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) {
6046   // A Load instruction can be treated the same as an Assign instruction, after
6047   // the source operand is transformed into an X86OperandMem operand.  Note that
6048   // the address mode optimization already creates an X86OperandMem operand, so
6049   // it doesn't need another level of transformation.
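  // E.g. (roughly): a load whose address has been folded to (Base, Index,
  // Shift, Offset) lowers via lowerAssign to a single
  //   mov dest, [Base + Index*(1<<Shift) + Offset]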
6050   Variable *DestLoad = Load->getDest();
6051   Type Ty = DestLoad->getType();
6052   Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
6053   doMockBoundsCheck(Src0);
6054   auto *Assign = InstAssign::create(Func, DestLoad, Src0);
6055   lowerAssign(Assign);
6056 }
6057 
6058 template <typename TraitsType>
6059 void TargetX86Base<TraitsType>::doAddressOptOther() {
6060   // Inverts some Icmp instructions which helps doAddressOptLoad later.
6061   // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
6062   Inst *Instr = iteratorToInst(Context.getCur());
6063   auto *VMetadata = Func->getVMetadata();
6064   if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
6065     if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
6066         llvm::isa<Constant>(Icmp->getSrc(1)))
6067       return;
6068     auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
6069     if (Var0 == nullptr)
6070       return;
6071     if (!VMetadata->isTracked(Var0))
6072       return;
6073     auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
6074     if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
6075       return;
6076     if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
6077       return;
6078 
6079     auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
6080     if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
6081       auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
6082       if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
6083           llvm::isa<InstLoad>(Op1Def)) {
6084         return; // Both are loads
6085       }
6086     }
6087     Icmp->reverseConditionAndOperands();
6088   }
6089 }
6090 
6091 template <typename TraitsType>
6092 void TargetX86Base<TraitsType>::doAddressOptLoad() {
6093   Inst *Instr = iteratorToInst(Context.getCur());
6094   Operand *Addr = Instr->getSrc(0);
6095   Variable *Dest = Instr->getDest();
6096   if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
6097     Instr->setDeleted();
6098     Context.insert<InstLoad>(Dest, OptAddr);
6099   }
6100 }
6101 
6102 template <typename TraitsType>
6103 void TargetX86Base<TraitsType>::doAddressOptLoadSubVector() {
6104   auto *Intrinsic = llvm::cast<InstIntrinsicCall>(Context.getCur());
6105   Operand *Addr = Intrinsic->getArg(0);
6106   Variable *Dest = Intrinsic->getDest();
6107   if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
6108     Intrinsic->setDeleted();
6109     const Ice::Intrinsics::IntrinsicInfo Info = {
6110         Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
6111         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
6112     auto Target = Ctx->getConstantUndef(Ice::IceType_i32);
6113     auto *NewLoad = Context.insert<InstIntrinsicCall>(2, Dest, Target, Info);
6114     NewLoad->addArg(OptAddr);
6115     NewLoad->addArg(Intrinsic->getArg(1));
6116   }
6117 }
6118 
6119 template <typename TraitsType>
6120 void TargetX86Base<TraitsType>::randomlyInsertNop(float Probability,
6121                                                   RandomNumberGenerator &RNG) {
6122   RandomNumberGeneratorWrapper RNGW(RNG);
6123   if (RNGW.getTrueWithProbability(Probability)) {
6124     _nop(RNGW(Traits::X86_NUM_NOP_VARIANTS));
6125   }
6126 }
6127 
6128 template <typename TraitsType>
6129 void TargetX86Base<TraitsType>::lowerPhi(const InstPhi * /*Instr*/) {
6130   Func->setError("Phi found in regular instruction list");
6131 }
6132 
6133 template <typename TraitsType>
6134 void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) {
6135   Variable *Reg = nullptr;
6136   if (Instr->hasRetValue()) {
6137     Operand *RetValue = legalize(Instr->getRetValue());
6138     const Type ReturnType = RetValue->getType();
6139     assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
6140            (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
6141     Reg = moveReturnValueToRegister(RetValue, ReturnType);
6142   }
6143   // Add a ret instruction even if sandboxing is enabled, because addEpilog
6144   // explicitly looks for a ret instruction as a marker for where to insert the
6145   // frame removal instructions.
6146   _ret(Reg);
6147   // Add a fake use of esp to make sure esp stays alive for the entire
6148   // function. Otherwise post-call esp adjustments get dead-code eliminated.
6149   keepEspLiveAtExit();
6150 }
6151 
6152 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
6153                                SizeT Index3) {
6154   const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
6155                      ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
6156   assert(Mask < 256);
6157   return Mask;
6158 }
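// E.g. makePshufdMask(3, 2, 1, 0) yields 0x1B, the pshufd immediate that
// reverses the four 32-bit lanes: result lane i takes source lane
// (imm >> 2*i) & 3.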
6159 
6160 template <typename TraitsType>
6161 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc(
6162     Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
6163   constexpr SizeT SrcBit = 1 << 2;
6164   assert((Index0 & SrcBit) == (Index1 & SrcBit));
6165   assert((Index0 & SrcBit) == (Index2 & SrcBit));
6166   assert((Index0 & SrcBit) == (Index3 & SrcBit));
6167   (void)SrcBit;
6168 
6169   const Type SrcTy = Src->getType();
6170   auto *T = makeReg(SrcTy);
6171   auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
6172   auto *Mask =
6173       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
6174   _pshufd(T, SrcRM, Mask);
6175   return T;
6176 }
6177 
6178 template <typename TraitsType>
6179 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc(
6180     Operand *Src0, SizeT Index0, SizeT Index1, Operand *Src1, SizeT Index2,
6181     SizeT Index3) {
6182   constexpr SizeT SrcBit = 1 << 2;
6183   assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
6184   assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
6185   (void)SrcBit;
6186 
6187   const Type SrcTy = Src0->getType();
6188   assert(Src1->getType() == SrcTy);
6189   auto *T = makeReg(SrcTy);
6190   auto *Src0R = legalizeToReg(Src0);
6191   auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6192   auto *Mask =
6193       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
6194   _movp(T, Src0R);
6195   _shufps(T, Src1RM, Mask);
6196   return T;
6197 }
6198 
6199 template <typename TraitsType>
6200 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs(
6201     Operand *Src0, SizeT Index0, Operand *Src1, SizeT Index1) {
6202   return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
6203                                            Index1, IGNORE_INDEX);
6204 }
6205 
6206 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
6207                                SizeT Index3) {
6208   constexpr SizeT SrcBit = 1 << 2;
6209   const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
6210   const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
6211   const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
6212   const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
6213   return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
6214 }
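// E.g. for a v4i32 shuffle with indexes (0, 5, 1, 4): indexes >= 4 have
// SrcBit set and come from Src1, so lanes 1 and 3 select Src1 and the
// resulting mask is 0b1010.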
6215 
6216 template <typename TraitsType>
6217 GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() {
6218   GlobalString FuncName = Func->getFunctionName();
6219   const SizeT Id = PshufbMaskCount++;
6220   if (!BuildDefs::dump() || !FuncName.hasStdString()) {
6221     return GlobalString::createWithString(
6222         Ctx,
6223         "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
6224   }
6225   return GlobalString::createWithString(
6226       Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
6227 }
6228 
6229 template <typename TraitsType>
6230 ConstantRelocatable *
6231 TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask(
6232     int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
6233     int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
6234     int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
6235     int8_t Idx15) {
6236   static constexpr uint8_t NumElements = 16;
6237   const char Initializer[NumElements] = {
6238       Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
6239       Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
6240   };
6241 
6242   static constexpr Type V4VectorType = IceType_v4i32;
6243   const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
6244   auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
6245   GlobalString MaskName = lowerShuffleVector_NewMaskName();
6246   Mask->setIsConstant(true);
6247   Mask->addInitializer(VariableDeclaration::DataInitializer::create(
6248       Func->getGlobalPool(), Initializer, NumElements));
6249   Mask->setName(MaskName);
6250   // Mask needs to be 16-byte aligned, or pshufb will seg fault.
6251   Mask->setAlignment(MaskAlignment);
6252   Func->addGlobal(Mask);
6253 
6254   constexpr RelocOffsetT Offset = 0;
6255   return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
6256 }
6257 
6258 template <typename TraitsType>
6259 void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb(
6260     Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
6261     int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
6262     int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
6263     int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
6264   const Type DestTy = Dest->getType();
6265   static constexpr bool NotRebased = false;
6266   static constexpr Variable *NoBase = nullptr;
  // We use void for the memory operand instead of DestTy because the latter
  // causes a validation failure: the X86 Inst layer complains that vector mem
  // operands could be under-aligned. Using void avoids the validation error.
  // Note that the mask global declaration is aligned, so it can be used as an
  // XMM mem operand.
6272   static constexpr Type MaskType = IceType_void;
6273 #define IDX_IN_SRC(N, S)                                                       \
6274   ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
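  // IDX_IN_SRC(N, S) keeps index N only if its source bit (bit 4) names
  // source S; every other lane becomes CLEAR_ALL_BITS, whose set high bit
  // makes pshufb write a zero byte. Each source is shuffled with its own
  // mask; when any index selects Src1, a second pshufb is emitted below and
  // the two partial results are or'd together.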
6275   auto *Mask0M = X86OperandMem::create(
6276       Func, MaskType, NoBase,
6277       lowerShuffleVector_CreatePshufbMask(
6278           IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
6279           IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
6280           IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
6281           IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
6282           IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
6283           IDX_IN_SRC(Idx15, 0)),
6284       NotRebased);
6285 
6286   auto *T0 = makeReg(DestTy);
6287   auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6288   _movp(T0, Src0RM);
6289 
6290   _pshufb(T0, Mask0M);
6291 
6292   if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
6293       Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
6294       Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
6295       Idx15 >= 16) {
6296     auto *Mask1M = X86OperandMem::create(
6297         Func, MaskType, NoBase,
6298         lowerShuffleVector_CreatePshufbMask(
6299             IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
6300             IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
6301             IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
6302             IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
6303             IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
6304             IDX_IN_SRC(Idx15, 1)),
6305         NotRebased);
6306 #undef IDX_IN_SRC
6307     auto *T1 = makeReg(DestTy);
6308     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6309     _movp(T1, Src1RM);
6310     _pshufb(T1, Mask1M);
6311     _por(T0, T1);
6312   }
6313 
6314   _movp(Dest, T0);
6315 }
6316 
6317 template <typename TraitsType>
6318 void TargetX86Base<TraitsType>::lowerShuffleVector(
6319     const InstShuffleVector *Instr) {
6320   auto *Dest = Instr->getDest();
6321   const Type DestTy = Dest->getType();
6322   auto *Src0 = Instr->getSrc(0);
6323   auto *Src1 = Instr->getSrc(1);
6324   const SizeT NumElements = typeNumElements(DestTy);
6325 
6326   auto *T = makeReg(DestTy);
6327 
6328   switch (DestTy) {
6329   default:
6330     llvm::report_fatal_error("Unexpected vector type.");
6331   case IceType_v16i1:
6332   case IceType_v16i8: {
6333     static constexpr SizeT ExpectedNumElements = 16;
6334     assert(ExpectedNumElements == Instr->getNumIndexes());
6335     (void)ExpectedNumElements;
6336 
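    // The indexesAre() patterns below map directly onto single punpck
    // instructions; e.g. indexes (0,16,1,17,...,7,23) interleave the low
    // eight bytes of Src0 and Src1, which is exactly punpcklbw.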
6337     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
6338       auto *T = makeReg(DestTy);
6339       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6340       _movp(T, Src0RM);
6341       _punpckl(T, Src0RM);
6342       _movp(Dest, T);
6343       return;
6344     }
6345 
6346     if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
6347                           23)) {
6348       auto *T = makeReg(DestTy);
6349       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6350       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6351       _movp(T, Src0RM);
6352       _punpckl(T, Src1RM);
6353       _movp(Dest, T);
6354       return;
6355     }
6356 
6357     if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
6358                           15, 15)) {
6359       auto *T = makeReg(DestTy);
6360       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6361       _movp(T, Src0RM);
6362       _punpckh(T, Src0RM);
6363       _movp(Dest, T);
6364       return;
6365     }
6366 
6367     if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
6368                           15, 31)) {
6369       auto *T = makeReg(DestTy);
6370       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6371       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6372       _movp(T, Src0RM);
6373       _punpckh(T, Src1RM);
6374       _movp(Dest, T);
6375       return;
6376     }
6377 
6378     if (InstructionSet < Traits::SSE4_1) {
6379       // TODO(jpp): figure out how to lower with sse2.
6380       break;
6381     }
6382 
6383     const SizeT Index0 = Instr->getIndexValue(0);
6384     const SizeT Index1 = Instr->getIndexValue(1);
6385     const SizeT Index2 = Instr->getIndexValue(2);
6386     const SizeT Index3 = Instr->getIndexValue(3);
6387     const SizeT Index4 = Instr->getIndexValue(4);
6388     const SizeT Index5 = Instr->getIndexValue(5);
6389     const SizeT Index6 = Instr->getIndexValue(6);
6390     const SizeT Index7 = Instr->getIndexValue(7);
6391     const SizeT Index8 = Instr->getIndexValue(8);
6392     const SizeT Index9 = Instr->getIndexValue(9);
6393     const SizeT Index10 = Instr->getIndexValue(10);
6394     const SizeT Index11 = Instr->getIndexValue(11);
6395     const SizeT Index12 = Instr->getIndexValue(12);
6396     const SizeT Index13 = Instr->getIndexValue(13);
6397     const SizeT Index14 = Instr->getIndexValue(14);
6398     const SizeT Index15 = Instr->getIndexValue(15);
6399 
6400     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
6401                                    Index3, Index4, Index5, Index6, Index7,
6402                                    Index8, Index9, Index10, Index11, Index12,
6403                                    Index13, Index14, Index15);
6404     return;
6405   }
6406   case IceType_v8i1:
6407   case IceType_v8i16: {
6408     static constexpr SizeT ExpectedNumElements = 8;
6409     assert(ExpectedNumElements == Instr->getNumIndexes());
6410     (void)ExpectedNumElements;
6411 
6412     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
6413       auto *T = makeReg(DestTy);
6414       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6415       _movp(T, Src0RM);
6416       _punpckl(T, Src0RM);
6417       _movp(Dest, T);
6418       return;
6419     }
6420 
6421     if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
6422       auto *T = makeReg(DestTy);
6423       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6424       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6425       _movp(T, Src0RM);
6426       _punpckl(T, Src1RM);
6427       _movp(Dest, T);
6428       return;
6429     }
6430 
6431     if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
6432       auto *T = makeReg(DestTy);
6433       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6434       _movp(T, Src0RM);
6435       _punpckh(T, Src0RM);
6436       _movp(Dest, T);
6437       return;
6438     }
6439 
6440     if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
6441       auto *T = makeReg(DestTy);
6442       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
6443       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6444       _movp(T, Src0RM);
6445       _punpckh(T, Src1RM);
6446       _movp(Dest, T);
6447       return;
6448     }
6449 
6450     if (InstructionSet < Traits::SSE4_1) {
6451       // TODO(jpp): figure out how to lower with sse2.
6452       break;
6453     }
6454 
6455     const SizeT Index0 = Instr->getIndexValue(0);
6456     const SizeT Index1 = Instr->getIndexValue(1);
6457     const SizeT Index2 = Instr->getIndexValue(2);
6458     const SizeT Index3 = Instr->getIndexValue(3);
6459     const SizeT Index4 = Instr->getIndexValue(4);
6460     const SizeT Index5 = Instr->getIndexValue(5);
6461     const SizeT Index6 = Instr->getIndexValue(6);
6462     const SizeT Index7 = Instr->getIndexValue(7);
6463 
6464 #define TO_BYTE_INDEX(I) ((I) << 1)
6465     lowerShuffleVector_UsingPshufb(
6466         Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
6467         TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
6468         TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
6469         TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
6470         TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
6471         TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
6472         TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
6473         TO_BYTE_INDEX(Index7) + 1);
6474 #undef TO_BYTE_INDEX
6475     return;
6476   }
6477   case IceType_v4i1:
6478   case IceType_v4i32:
6479   case IceType_v4f32: {
6480     static constexpr SizeT ExpectedNumElements = 4;
6481     assert(ExpectedNumElements == Instr->getNumIndexes());
6482     const SizeT Index0 = Instr->getIndexValue(0);
6483     const SizeT Index1 = Instr->getIndexValue(1);
6484     const SizeT Index2 = Instr->getIndexValue(2);
6485     const SizeT Index3 = Instr->getIndexValue(3);
6486     Variable *T = nullptr;
6487     switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
6488 #define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
6489   case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
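      // Each case key encodes, per result lane, which source (0 or 1) the
      // lane is taken from; e.g. CASE_SRCS_IN(0, 0, 1, 1) means lanes 0-1
      // come from Src0 and lanes 2-3 from Src1, which maps onto a single
      // movp + shufps pair.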
6490       CASE_SRCS_IN(0, 0, 0, 0) : {
6491         T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
6492                                               Index3);
6493       }
6494       break;
6495       CASE_SRCS_IN(0, 0, 0, 1) : {
6496         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6497                                                                   Src1, Index3);
6498         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6499                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6500       }
6501       break;
6502       CASE_SRCS_IN(0, 0, 1, 0) : {
6503         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6504                                                                   Src0, Index3);
6505         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
6506                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6507       }
6508       break;
6509       CASE_SRCS_IN(0, 0, 1, 1) : {
6510         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
6511                                               Index2, Index3);
6512       }
6513       break;
6514       CASE_SRCS_IN(0, 1, 0, 0) : {
6515         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6516                                                                   Src1, Index1);
6517         T = lowerShuffleVector_TwoFromSameSrc(
6518             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6519       }
6520       break;
6521       CASE_SRCS_IN(0, 1, 0, 1) : {
6522         if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
6523             (Index3 - ExpectedNumElements) == 1) {
6524           auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
6525           auto *Src0R = legalizeToReg(Src0);
6526           T = makeReg(DestTy);
6527           _movp(T, Src0R);
6528           _punpckl(T, Src1RM);
6529         } else if (Index0 == Index2 && Index1 == Index3) {
6530           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6531               Src0, Index0, Src1, Index1);
6532           T = lowerShuffleVector_AllFromSameSrc(
6533               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6534               UNIFIED_INDEX_1);
6535         } else {
6536           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6537               Src0, Index0, Src1, Index1);
6538           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6539               Src0, Index2, Src1, Index3);
6540           T = lowerShuffleVector_TwoFromSameSrc(
6541               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6542               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6543         }
6544       }
6545       break;
6546       CASE_SRCS_IN(0, 1, 1, 0) : {
6547         if (Index0 == Index3 && Index1 == Index2) {
6548           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6549               Src0, Index0, Src1, Index1);
6550           T = lowerShuffleVector_AllFromSameSrc(
6551               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6552               UNIFIED_INDEX_0);
6553         } else {
6554           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6555               Src0, Index0, Src1, Index1);
6556           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6557               Src1, Index2, Src0, Index3);
6558           T = lowerShuffleVector_TwoFromSameSrc(
6559               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6560               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6561         }
6562       }
6563       break;
6564       CASE_SRCS_IN(0, 1, 1, 1) : {
6565         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6566                                                                   Src1, Index1);
6567         T = lowerShuffleVector_TwoFromSameSrc(
6568             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6569       }
6570       break;
6571       CASE_SRCS_IN(1, 0, 0, 0) : {
6572         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6573                                                                   Src0, Index1);
6574         T = lowerShuffleVector_TwoFromSameSrc(
6575             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6576       }
6577       break;
6578       CASE_SRCS_IN(1, 0, 0, 1) : {
6579         if (Index0 == Index3 && Index1 == Index2) {
6580           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6581               Src1, Index0, Src0, Index1);
6582           T = lowerShuffleVector_AllFromSameSrc(
6583               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6584               UNIFIED_INDEX_0);
6585         } else {
6586           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6587               Src1, Index0, Src0, Index1);
6588           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6589               Src0, Index2, Src1, Index3);
6590           T = lowerShuffleVector_TwoFromSameSrc(
6591               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6592               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6593         }
6594       }
6595       break;
6596       CASE_SRCS_IN(1, 0, 1, 0) : {
6597         if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
6598             (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
6599           auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
6600           auto *Src0R = legalizeToReg(Src1);
6601           T = makeReg(DestTy);
6602           _movp(T, Src0R);
6603           _punpckl(T, Src1RM);
6604         } else if (Index0 == Index2 && Index1 == Index3) {
6605           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6606               Src1, Index0, Src0, Index1);
6607           T = lowerShuffleVector_AllFromSameSrc(
6608               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6609               UNIFIED_INDEX_1);
6610         } else {
6611           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6612               Src1, Index0, Src0, Index1);
6613           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6614               Src1, Index2, Src0, Index3);
6615           T = lowerShuffleVector_TwoFromSameSrc(
6616               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6617               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6618         }
6619       }
6620       break;
6621       CASE_SRCS_IN(1, 0, 1, 1) : {
6622         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6623                                                                   Src0, Index1);
6624         T = lowerShuffleVector_TwoFromSameSrc(
6625             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6626       }
6627       break;
6628       CASE_SRCS_IN(1, 1, 0, 0) : {
6629         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
6630                                               Index2, Index3);
6631       }
6632       break;
6633       CASE_SRCS_IN(1, 1, 0, 1) : {
6634         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6635                                                                   Src1, Index3);
6636         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6637                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6638       }
6639       break;
6640       CASE_SRCS_IN(1, 1, 1, 0) : {
6641         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6642                                                                   Src0, Index3);
6643         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6644                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6645       }
6646       break;
6647       CASE_SRCS_IN(1, 1, 1, 1) : {
6648         T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
6649                                               Index3);
6650       }
6651       break;
6652 #undef CASE_SRCS_IN
6653     }
6654 
6655     assert(T != nullptr);
6656     assert(T->getType() == DestTy);
6657     _movp(Dest, T);
6658     return;
6659   } break;
6660   }
6661 
6662   // Unoptimized shuffle. Perform a series of inserts and extracts.
6663   Context.insert<InstFakeDef>(T);
6664   const Type ElementType = typeElementType(DestTy);
6665   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
6666     auto *Index = Instr->getIndex(I);
6667     const SizeT Elem = Index->getValue();
6668     auto *ExtElmt = makeReg(ElementType);
6669     if (Elem < NumElements) {
6670       lowerExtractElement(
6671           InstExtractElement::create(Func, ExtElmt, Src0, Index));
6672     } else {
6673       lowerExtractElement(InstExtractElement::create(
6674           Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
6675     }
6676     auto *NewT = makeReg(DestTy);
6677     lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
6678                                                  Ctx->getConstantInt32(I)));
6679     T = NewT;
6680   }
6681   _movp(Dest, T);
6682 }
6683 
6684 template <typename TraitsType>
6685 void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
6686   Variable *Dest = Select->getDest();
6687 
6688   Operand *Condition = Select->getCondition();
6689   // Handle folding opportunities.
6690   if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
6691     assert(Producer->isDeleted());
6692     switch (BoolFolding<Traits>::getProducerKind(Producer)) {
6693     default:
6694       break;
6695     case BoolFolding<Traits>::PK_Icmp32:
6696     case BoolFolding<Traits>::PK_Icmp64: {
6697       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
6698       return;
6699     }
6700     case BoolFolding<Traits>::PK_Fcmp: {
6701       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
6702       return;
6703     }
6704     }
6705   }
6706 
6707   if (isVectorType(Dest->getType())) {
6708     lowerSelectVector(Select);
6709     return;
6710   }
6711 
6712   Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
6713   Operand *Zero = Ctx->getConstantZero(IceType_i32);
6714   _cmp(CmpResult, Zero);
6715   Operand *SrcT = Select->getTrueOperand();
6716   Operand *SrcF = Select->getFalseOperand();
6717   const BrCond Cond = Traits::Cond::Br_ne;
6718   lowerSelectMove(Dest, Cond, SrcT, SrcF);
6719 }
6720 
6721 template <typename TraitsType>
6722 void TargetX86Base<TraitsType>::lowerSelectMove(Variable *Dest, BrCond Cond,
6723                                                 Operand *SrcT, Operand *SrcF) {
6724   Type DestTy = Dest->getType();
6725   if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
6726     // The cmov instruction doesn't allow 8-bit or FP operands, so we need
6727     // explicit control flow.
6728     // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
6729     auto *Label = InstX86Label::create(Func, this);
6730     SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
6731     _mov(Dest, SrcT);
6732     _br(Cond, Label);
6733     SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
6734     _redefined(_mov(Dest, SrcF));
6735     Context.insert(Label);
6736     return;
6737   }
6738   // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
6739   // But if SrcT is immediate, we might be able to do better, as the cmov
6740   // instruction doesn't allow an immediate operand:
6741   // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
6742   if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
6743     std::swap(SrcT, SrcF);
6744     Cond = InstImpl<TraitsType>::InstX86Base::getOppositeCondition(Cond);
6745   }
6746   if (!Traits::Is64Bit && DestTy == IceType_i64) {
6747     SrcT = legalizeUndef(SrcT);
6748     SrcF = legalizeUndef(SrcF);
6749     // Set the low portion.
6750     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6751     lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
6752     // Set the high portion.
6753     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6754     lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
6755     return;
6756   }
6757 
6758   assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
6759          (Traits::Is64Bit && DestTy == IceType_i64));
6760   lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
6761 }
6762 
6763 template <typename TraitsType>
6764 void TargetX86Base<TraitsType>::lowerSelectIntMove(Variable *Dest, BrCond Cond,
6765                                                    Operand *SrcT,
6766                                                    Operand *SrcF) {
6767   Variable *T = nullptr;
6768   SrcF = legalize(SrcF);
6769   _mov(T, SrcF);
6770   SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
6771   _cmov(T, SrcT, Cond);
6772   _mov(Dest, T);
6773 }
6774 
6775 template <typename TraitsType>
6776 void TargetX86Base<TraitsType>::lowerMove(Variable *Dest, Operand *Src,
6777                                           bool IsRedefinition) {
6778   assert(Dest->getType() == Src->getType());
6779   assert(!Dest->isRematerializable());
6780   if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
6781     Src = legalize(Src);
6782     Operand *SrcLo = loOperand(Src);
6783     Operand *SrcHi = hiOperand(Src);
6784     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6785     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6786     Variable *T_Lo = nullptr, *T_Hi = nullptr;
6787     _mov(T_Lo, SrcLo);
6788     _redefined(_mov(DestLo, T_Lo), IsRedefinition);
6789     _mov(T_Hi, SrcHi);
6790     _redefined(_mov(DestHi, T_Hi), IsRedefinition);
6791   } else {
6792     Operand *SrcLegal;
6793     if (Dest->hasReg()) {
6794       // If Dest already has a physical register, then only basic legalization
6795       // is needed, as the source operand can be a register, immediate, or
6796       // memory.
6797       SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
6798     } else {
6799       // If Dest could be a stack operand, then RI must be a physical register
6800       // or a scalar integer immediate.
6801       SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
6802     }
6803     if (isVectorType(Dest->getType())) {
6804       _redefined(_movp(Dest, SrcLegal), IsRedefinition);
6805     } else {
6806       _redefined(_mov(Dest, SrcLegal), IsRedefinition);
6807     }
6808   }
6809 }
6810 
6811 template <typename TraitsType>
6812 bool TargetX86Base<TraitsType>::lowerOptimizeFcmpSelect(
6813     const InstFcmp *Fcmp, const InstSelect *Select) {
6814   Operand *CmpSrc0 = Fcmp->getSrc(0);
6815   Operand *CmpSrc1 = Fcmp->getSrc(1);
6816   Operand *SelectSrcT = Select->getTrueOperand();
6817   Operand *SelectSrcF = Select->getFalseOperand();
6818   Variable *SelectDest = Select->getDest();
6819 
6820   // TODO(capn): also handle swapped compare/select operand order.
6821   if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
6822     return false;
6823 
6824   // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
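  // Note that Ogt/Olt map exactly onto maxss/minss (and maxps/minps):
  // e.g. maxss computes (a > b) ? a : b and yields its second operand when
  // either input is NaN, which matches select(fcmp ogt a, b; a, b) including
  // the unordered case.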
6825   InstFcmp::FCond Condition = Fcmp->getCondition();
6826   switch (Condition) {
6827   default:
6828     return false;
6829   case InstFcmp::True:
6830     break;
6831   case InstFcmp::False:
6832     break;
6833   case InstFcmp::Ogt: {
6834     Variable *T = makeReg(SelectDest->getType());
6835     if (isScalarFloatingType(SelectSrcT->getType())) {
6836       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6837       _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6838       _mov(SelectDest, T);
6839     } else {
6840       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6841       _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6842       _movp(SelectDest, T);
6843     }
6844     return true;
6845   } break;
6846   case InstFcmp::Olt: {
6847     Variable *T = makeReg(SelectSrcT->getType());
6848     if (isScalarFloatingType(SelectSrcT->getType())) {
6849       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6850       _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6851       _mov(SelectDest, T);
6852     } else {
6853       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
6854       _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
6855       _movp(SelectDest, T);
6856     }
6857     return true;
6858   } break;
6859   }
6860   return false;
6861 }
6862 
6863 template <typename TraitsType>
6864 void TargetX86Base<TraitsType>::lowerIcmp(const InstIcmp *Icmp) {
6865   Variable *Dest = Icmp->getDest();
6866   if (isVectorType(Dest->getType())) {
6867     lowerIcmpVector(Icmp);
6868   } else {
6869     constexpr Inst *Consumer = nullptr;
6870     lowerIcmpAndConsumer(Icmp, Consumer);
6871   }
6872 }
6873 
6874 template <typename TraitsType>
6875 void TargetX86Base<TraitsType>::lowerSelectVector(const InstSelect *Instr) {
6876   Variable *Dest = Instr->getDest();
6877   Type DestTy = Dest->getType();
6878   Operand *SrcT = Instr->getTrueOperand();
6879   Operand *SrcF = Instr->getFalseOperand();
6880   Operand *Condition = Instr->getCondition();
6881 
6882   if (!isVectorType(DestTy))
6883     llvm::report_fatal_error("Expected a vector select");
6884 
6885   Type SrcTy = SrcT->getType();
6886   Variable *T = makeReg(SrcTy);
6887   Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
6888   Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
6889 
6890   if (InstructionSet >= Traits::SSE4_1) {
6891     // TODO(wala): If the condition operand is a constant, use blendps or
6892     // pblendw.
6893     //
6894     // Use blendvps or pblendvb to implement select.
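    // blendvps/pblendvb choose each lane from the second source when the
    // corresponding sign bit in xmm0 is set, so the boolean condition must
    // first be moved into the sign bit (via the psll-by-31 below, or via sign
    // extension for the byte/word case).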
6895     if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
6896         SrcTy == IceType_v4f32) {
6897       Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6898       Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0);
6899       _movp(xmm0, ConditionRM);
6900       _psll(xmm0, Ctx->getConstantInt8(31));
6901       _movp(T, SrcFRM);
6902       _blendvps(T, SrcTRM, xmm0);
6903       _movp(Dest, T);
6904     } else {
6905       assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
6906       Type SignExtTy =
6907           Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
6908       Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0);
6909       lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
6910       _movp(T, SrcFRM);
6911       _pblendvb(T, SrcTRM, xmm0);
6912       _movp(Dest, T);
6913     }
6914     return;
6915   }
6916   // Lower select without Traits::SSE4.1:
6917   // a=d?b:c ==>
6918   //   if elementtype(d) != i1:
6919   //      d=sext(d);
6920   //   a=(b&d)|(c&~d);
  Variable *T2 = makeReg(SrcTy);
  // Sign extend the condition operand if applicable.
  if (SrcTy == IceType_v4f32) {
    // The sext operation takes only integer arguments.
    Variable *T3 = Func->makeVariable(IceType_v4i32);
    lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
    _movp(T, T3);
  } else if (typeElementType(SrcTy) != IceType_i1) {
    lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
  } else {
    Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
    _movp(T, ConditionRM);
  }
  _movp(T2, T);
  _pand(T, SrcTRM);
  _pandn(T2, SrcFRM);
  _por(T, T2);
  _movp(Dest, T);

  return;
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerStore(const InstStore *Instr) {
  Operand *Value = Instr->getData();
  Operand *Addr = Instr->getAddr();
  X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
  doMockBoundsCheck(NewAddr);
  Type Ty = NewAddr->getType();

  if (!Traits::Is64Bit && Ty == IceType_i64) {
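    // E.g. (illustrative sketch, AT&T syntax) an i64 store on x86-32 becomes
    // two 32-bit stores:
    //   movl %valueHi, 4(%addr)
    //   movl %valueLo, (%addr)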
    Value = legalizeUndef(Value);
    Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
    _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
    _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
  } else if (isVectorType(Ty)) {
    _storep(legalizeToReg(Value), NewAddr);
  } else {
    Value = legalize(Value, Legal_Reg | Legal_Imm);
    _store(Value, NewAddr);
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::doAddressOptStore() {
  auto *Instr = llvm::cast<InstStore>(Context.getCur());
  Operand *Addr = Instr->getAddr();
  Operand *Data = Instr->getData();
  if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
    Instr->setDeleted();
    auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
    if (Instr->getDest())
      NewStore->setRmwBeacon(Instr->getRmwBeacon());
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::doAddressOptStoreSubVector() {
  auto *Intrinsic = llvm::cast<InstIntrinsicCall>(Context.getCur());
  Operand *Addr = Intrinsic->getArg(1);
  Operand *Data = Intrinsic->getArg(0);
  if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
    Intrinsic->setDeleted();
    const Ice::Intrinsics::IntrinsicInfo Info = {
        Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
        Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
    auto Target = Ctx->getConstantUndef(Ice::IceType_i32);
    auto *NewStore =
        Context.insert<InstIntrinsicCall>(3, nullptr, Target, Info);
    NewStore->addArg(Data);
    NewStore->addArg(OptAddr);
    NewStore->addArg(Intrinsic->getArg(2));
  }
}

template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::lowerCmpRange(Operand *Comparison,
                                                  uint64_t Min, uint64_t Max) {
  // TODO(ascull): 64-bit should not reach here, but only because it is not
  // implemented yet. This should be able to handle the 64-bit case.
  assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
  // Subtracting 0 is a nop, so don't do it.
  if (Min != 0) {
    // Avoid clobbering the comparison by copying it.
    Variable *T = nullptr;
    _mov(T, Comparison);
    _sub(T, Ctx->getConstantInt32(Min));
    Comparison = T;
  }

  _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
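  // After the bias by Min, "Min <= x && x <= Max" collapses into one unsigned
  // compare; callers branch Br_a (above) for out-of-range, or Br_be for
  // in-range. E.g. for the range [10, 20] (illustrative sketch):
  //   sub $10, %eax
  //   cmp $10, %eax      ; Max - Min
  //   ja  out_of_range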

  return Comparison;
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerCaseCluster(const CaseCluster &Case,
                                                 Operand *Comparison,
                                                 bool DoneCmp,
                                                 CfgNode *DefaultTarget) {
  switch (Case.getKind()) {
  case CaseCluster::JumpTable: {
    InstX86Label *SkipJumpTable;

    Operand *RangeIndex =
        lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
    if (DefaultTarget == nullptr) {
      // Skip over the jump table logic if the comparison is out of range and
      // there is no default case.
      SkipJumpTable = InstX86Label::create(Func, this);
      _br(Traits::Cond::Br_a, SkipJumpTable);
    } else {
      _br(Traits::Cond::Br_a, DefaultTarget);
    }

    InstJumpTable *JumpTable = Case.getJumpTable();
    Context.insert(JumpTable);
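    // The dispatch being built around the table looks roughly like this for a
    // 32-bit pointer (illustrative sketch, AT&T syntax):
    //   cmp $(High - Low), %eax    ; from lowerCmpRange above
    //   ja  <default or skip label>
    //   mov JT(, %eax, 4), %ecx    ; load the target out of the jump table
    //   jmp *%ecx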

    // Make sure the index is a register of the same width as the base.
    Variable *Index;
    const Type PointerType = getPointerType();
    if (RangeIndex->getType() != PointerType) {
      Index = makeReg(PointerType);
      if (RangeIndex->getType() == IceType_i64) {
        assert(Traits::Is64Bit);
        _mov(Index, RangeIndex); // trunc
      } else {
        Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
        _movzx(Index, RangeIndexRM);
      }
    } else {
      Index = legalizeToReg(RangeIndex);
    }

    constexpr RelocOffsetT RelocOffset = 0;
    constexpr Variable *NoBase = nullptr;
    constexpr Constant *NoOffset = nullptr;
    auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
    Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
    uint16_t Shift = typeWidthInBytesLog2(PointerType);
    constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;

    Variable *Target = nullptr;
    if (Traits::Is64Bit && NeedSandboxing) {
      assert(Index != nullptr && Index->getType() == IceType_i32);
    }

    if (PointerType == IceType_i32) {
      _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
                                         Index, Shift, Segment));
    } else {
      auto *Base = makeReg(IceType_i64);
      _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
      _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
                                         Index, Shift, Segment));
    }

    lowerIndirectJump(Target);

    if (DefaultTarget == nullptr)
      Context.insert(SkipJumpTable);
    return;
  }
  case CaseCluster::Range: {
    if (Case.isUnitRange()) {
      // Single item.
      if (!DoneCmp) {
        Constant *Value = Ctx->getConstantInt32(Case.getLow());
        _cmp(Comparison, Value);
      }
      _br(Traits::Cond::Br_e, Case.getTarget());
    } else if (DoneCmp && Case.isPairRange()) {
      // Range of two items with the first item already compared against.
      _br(Traits::Cond::Br_e, Case.getTarget());
      Constant *Value = Ctx->getConstantInt32(Case.getHigh());
      _cmp(Comparison, Value);
      _br(Traits::Cond::Br_e, Case.getTarget());
    } else {
      // Range.
      lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
      _br(Traits::Cond::Br_be, Case.getTarget());
    }
    if (DefaultTarget != nullptr)
      _br(DefaultTarget);
    return;
  }
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerSwitch(const InstSwitch *Instr) {
  // Group the cases together and navigate through them with a binary search.
  CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
  Operand *Src0 = Instr->getComparison();
  CfgNode *DefaultTarget = Instr->getLabelDefault();

  assert(CaseClusters.size() != 0); // Should always be at least one

  if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
    Src0 = legalize(Src0); // get Base/Index into physical registers
    Operand *Src0Lo = loOperand(Src0);
    Operand *Src0Hi = hiOperand(Src0);
    if (CaseClusters.back().getHigh() > UINT32_MAX) {
      // TODO(ascull): handle the 64-bit case properly (this is currently a
      // naive version). It might be handled by a higher level lowering of
      // switches.
      SizeT NumCases = Instr->getNumCases();
      if (NumCases >= 2) {
        Src0Lo = legalizeToReg(Src0Lo);
        Src0Hi = legalizeToReg(Src0Hi);
      } else {
        Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
        Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
      }
      for (SizeT I = 0; I < NumCases; ++I) {
        Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
        Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
        InstX86Label *Label = InstX86Label::create(Func, this);
        _cmp(Src0Lo, ValueLo);
        _br(Traits::Cond::Br_ne, Label);
        _cmp(Src0Hi, ValueHi);
        _br(Traits::Cond::Br_e, Instr->getLabel(I));
        Context.insert(Label);
      }
      _br(Instr->getLabelDefault());
      return;
    } else {
      // All the values are 32-bit, so just check that the operand is too and
      // then fall through to the 32-bit implementation. This is a common case.
      Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
      Constant *Zero = Ctx->getConstantInt32(0);
      _cmp(Src0Hi, Zero);
      _br(Traits::Cond::Br_ne, DefaultTarget);
      Src0 = Src0Lo;
    }
  }

  // 32-bit lowering

  if (CaseClusters.size() == 1) {
    // Jump straight to the default if needed. Currently a common case, as jump
    // tables occur on their own.
    constexpr bool DoneCmp = false;
    lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
    return;
  }

  // The comparison is going to be used multiple times, so get it into a
  // register early.
  Variable *Comparison = legalizeToReg(Src0);

  // A span covers a contiguous range of the clusters.
  struct SearchSpan {
    SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
        : Begin(Begin), Size(Size), Label(Label) {}

    SizeT Begin;
    SizeT Size;
    InstX86Label *Label;
  };
  // The stack will only grow to the height of the tree, so 12 should be
  // plenty.
  std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
  SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
  bool DoneCmp = false;

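  // The emitted search has this shape, e.g. for five clusters C0..C4 with
  // pivot C2 (illustrative sketch, AT&T syntax):
  //   cmpl $low(C2), %comparison
  //   jb   .Lleft        ; C0..C1 are handled at .Lleft
  //   ...                ; C2, C3, C4, falling through from the pivot compare
  // .Lleft:
  //   ...                ; C0, C1
  //   jmp  default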
  while (!SearchSpanStack.empty()) {
    SearchSpan Span = SearchSpanStack.top();
    SearchSpanStack.pop();

    if (Span.Label != nullptr)
      Context.insert(Span.Label);

    switch (Span.Size) {
    case 0:
      llvm::report_fatal_error("Invalid SearchSpan size");
      break;

    case 1:
      lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
      DoneCmp = false;
      break;

    case 2: {
      const CaseCluster *CaseA = &CaseClusters[Span.Begin];
      const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];

      // Placing a range last may allow register clobbering during the range
      // test. That means there is no need to clone the register. If it is a
      // unit range the comparison may have already been done in the binary
      // search (DoneCmp) and so it should be placed first. If this is a range
      // of two items and the comparison with the low value has already been
      // done, comparing with the other element is cheaper than a range test.
      // If the low end of the range is zero then there is no subtraction and
      // nothing to be gained.
      if (!CaseA->isUnitRange() &&
          !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
        std::swap(CaseA, CaseB);
        DoneCmp = false;
      }

      lowerCaseCluster(*CaseA, Comparison, DoneCmp);
      DoneCmp = false;
      lowerCaseCluster(*CaseB, Comparison, DoneCmp,
                       SearchSpanStack.empty() ? nullptr : DefaultTarget);
    } break;

    default:
      // Pick the middle item and branch b or ae.
      SizeT PivotIndex = Span.Begin + (Span.Size / 2);
      const CaseCluster &Pivot = CaseClusters[PivotIndex];
      Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
      InstX86Label *Label = InstX86Label::create(Func, this);
      _cmp(Comparison, Value);
      // TODO(ascull): does it always have to be far?
      _br(Traits::Cond::Br_b, Label, InstX86Br::Far);
      // Lower the left and (pivot+right) sides, falling through to the right.
      SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
      SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
      DoneCmp = true;
      break;
    }
  }

  _br(DefaultTarget);
}

/// The following pattern occurs often in lowered C and C++ code:
///
///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
///
/// We can eliminate the sext operation by copying the result of pcmpeqd,
/// pcmpgtd, or cmpps (which produce sign extended results) to the result of
/// the sext operation.
template <typename TraitsType>
void TargetX86Base<TraitsType>::eliminateNextVectorSextInstruction(
    Variable *SignExtendedResult) {
  if (auto *NextCast =
          llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
    if (NextCast->getCastKind() == InstCast::Sext &&
        NextCast->getSrc(0) == SignExtendedResult) {
      NextCast->setDeleted();
      _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
      // Skip over the instruction.
      Context.advanceNext();
    }
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerUnreachable(
    const InstUnreachable * /*Instr*/) {
  _ud2();
  // Add a fake use of esp to make sure esp adjustments after the unreachable
  // do not get dead-code eliminated.
  keepEspLiveAtExit();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerBreakpoint(
    const InstBreakpoint * /*Instr*/) {
  _int3();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerRMW(const InstX86FakeRMW *RMW) {
  // If the beacon variable's live range does not end in this instruction, then
  // it must end in the modified Store instruction that follows. This means
  // that the original Store instruction is still there, either because the
  // value being stored is used beyond the Store instruction, or because dead
  // code elimination did not happen. In either case, we cancel RMW lowering
  // (and the caller deletes the RMW instruction).
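  // When lowering does proceed, e.g. "*p += x" with an i32 x becomes a single
  // read-modify-write instruction (illustrative sketch):
  //   addl %x, (%p)
  // and the i64 case on x86-32 becomes an add/adc pair over the two halves.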
  if (!RMW->isLastUse(RMW->getBeacon()))
    return;
  Operand *Src = RMW->getData();
  Type Ty = Src->getType();
  X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
  doMockBoundsCheck(Addr);
  if (!Traits::Is64Bit && Ty == IceType_i64) {
    Src = legalizeUndef(Src);
    Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
    Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
    auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
    auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
    switch (RMW->getOp()) {
    default:
      // TODO(stichnot): Implement other arithmetic operators.
      break;
    case InstArithmetic::Add:
      _add_rmw(AddrLo, SrcLo);
      _adc_rmw(AddrHi, SrcHi);
      return;
    case InstArithmetic::Sub:
      _sub_rmw(AddrLo, SrcLo);
      _sbb_rmw(AddrHi, SrcHi);
      return;
    case InstArithmetic::And:
      _and_rmw(AddrLo, SrcLo);
      _and_rmw(AddrHi, SrcHi);
      return;
    case InstArithmetic::Or:
      _or_rmw(AddrLo, SrcLo);
      _or_rmw(AddrHi, SrcHi);
      return;
    case InstArithmetic::Xor:
      _xor_rmw(AddrLo, SrcLo);
      _xor_rmw(AddrHi, SrcHi);
      return;
    }
  } else {
    // x86-32: i8, i16, i32
    // x86-64: i8, i16, i32, i64
    switch (RMW->getOp()) {
    default:
      // TODO(stichnot): Implement other arithmetic operators.
      break;
    case InstArithmetic::Add:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _add_rmw(Addr, Src);
      return;
    case InstArithmetic::Sub:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _sub_rmw(Addr, Src);
      return;
    case InstArithmetic::And:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _and_rmw(Addr, Src);
      return;
    case InstArithmetic::Or:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _or_rmw(Addr, Src);
      return;
    case InstArithmetic::Xor:
      Src = legalize(Src, Legal_Reg | Legal_Imm);
      _xor_rmw(Addr, Src);
      return;
    }
  }
  llvm::report_fatal_error("Couldn't lower RMW instruction");
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerOther(const Inst *Instr) {
  if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
    lowerRMW(RMW);
  } else {
    TargetLowering::lowerOther(Instr);
  }
}

/// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
/// preserve integrity of liveness analysis. Undef values are also turned into
/// zeroes, since loOperand() and hiOperand() don't expect Undef input.  Also,
/// in Non-SFI mode, add a FakeUse(RebasePtr) for every pooled constant
/// operand.
template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() {
  if (getFlags().getUseNonsfi()) {
    assert(RebasePtr);
    CfgNode *Node = Context.getNode();
    uint32_t RebasePtrUseCount = 0;
    for (Inst &I : Node->getPhis()) {
      auto *Phi = llvm::dyn_cast<InstPhi>(&I);
      if (Phi->isDeleted())
        continue;
      for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
        Operand *Src = Phi->getSrc(I);
        // TODO(stichnot): This over-counts for +0.0, and under-counts for
        // other kinds of pooling.
        if (llvm::isa<ConstantRelocatable>(Src) ||
            llvm::isa<ConstantFloat>(Src) || llvm::isa<ConstantDouble>(Src)) {
          ++RebasePtrUseCount;
        }
      }
    }
    if (RebasePtrUseCount) {
      Node->getInsts().push_front(InstFakeUse::create(Func, RebasePtr));
    }
  }
  if (Traits::Is64Bit) {
    // On x86-64 we don't need to prelower phis -- the architecture can handle
    // 64-bit integers natively.
    return;
  }

  // Pause constant blinding or pooling; blinding or pooling will be done
  // later, during phi lowering assignments.
  BoolFlagSaver B(RandomizationPoolingPaused, true);
  PhiLowering::prelowerPhis32Bit<TargetX86Base<TraitsType>>(
      this, Context.getNode(), Func);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
  uint32_t StackArgumentsSize = 0;
  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
    RuntimeHelper HelperID = RuntimeHelper::H_Num;
    Variable *Dest = Arith->getDest();
    Type DestTy = Dest->getType();
    if (!Traits::Is64Bit && DestTy == IceType_i64) {
      switch (Arith->getOp()) {
      default:
        return;
      case InstArithmetic::Udiv:
        HelperID = RuntimeHelper::H_udiv_i64;
        break;
      case InstArithmetic::Sdiv:
        HelperID = RuntimeHelper::H_sdiv_i64;
        break;
      case InstArithmetic::Urem:
        HelperID = RuntimeHelper::H_urem_i64;
        break;
      case InstArithmetic::Srem:
        HelperID = RuntimeHelper::H_srem_i64;
        break;
      }
    } else if (isVectorType(DestTy)) {
      Variable *Dest = Arith->getDest();
      Operand *Src0 = Arith->getSrc(0);
      Operand *Src1 = Arith->getSrc(1);
      switch (Arith->getOp()) {
      default:
        return;
      case InstArithmetic::Mul:
        if (DestTy == IceType_v16i8) {
          scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
          Arith->setDeleted();
        }
        return;
      case InstArithmetic::Shl:
      case InstArithmetic::Lshr:
      case InstArithmetic::Ashr:
        if (llvm::isa<Constant>(Src1)) {
          return;
        }
        // Fall through: non-constant shift amounts are scalarized below.
      case InstArithmetic::Udiv:
      case InstArithmetic::Urem:
      case InstArithmetic::Sdiv:
      case InstArithmetic::Srem:
      case InstArithmetic::Frem:
        scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
        Arith->setDeleted();
        return;
      }
    } else {
      switch (Arith->getOp()) {
      default:
        return;
      case InstArithmetic::Frem:
        if (isFloat32Asserting32Or64(DestTy))
          HelperID = RuntimeHelper::H_frem_f32;
        else
          HelperID = RuntimeHelper::H_frem_f64;
      }
    }
    constexpr SizeT MaxSrcs = 2;
    InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
    Call->addArg(Arith->getSrc(0));
    Call->addArg(Arith->getSrc(1));
    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
    Context.insert(Call);
    Arith->setDeleted();
  } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    InstCast::OpKind CastKind = Cast->getCastKind();
    Operand *Src0 = Cast->getSrc(0);
    const Type SrcType = Src0->getType();
    Variable *Dest = Cast->getDest();
    const Type DestTy = Dest->getType();
    RuntimeHelper HelperID = RuntimeHelper::H_Num;
    Variable *CallDest = Dest;
    switch (CastKind) {
    default:
      return;
    case InstCast::Fptosi:
      if (!Traits::Is64Bit && DestTy == IceType_i64) {
        HelperID = isFloat32Asserting32Or64(SrcType)
                       ? RuntimeHelper::H_fptosi_f32_i64
                       : RuntimeHelper::H_fptosi_f64_i64;
      } else {
        return;
      }
      break;
    case InstCast::Fptoui:
      if (isVectorType(DestTy)) {
        assert(DestTy == IceType_v4i32);
        assert(SrcType == IceType_v4f32);
        HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
      } else if (DestTy == IceType_i64 ||
                 (!Traits::Is64Bit && DestTy == IceType_i32)) {
        if (Traits::Is64Bit) {
          HelperID = isFloat32Asserting32Or64(SrcType)
                         ? RuntimeHelper::H_fptoui_f32_i64
                         : RuntimeHelper::H_fptoui_f64_i64;
        } else if (isInt32Asserting32Or64(DestTy)) {
          HelperID = isFloat32Asserting32Or64(SrcType)
                         ? RuntimeHelper::H_fptoui_f32_i32
                         : RuntimeHelper::H_fptoui_f64_i32;
        } else {
          HelperID = isFloat32Asserting32Or64(SrcType)
                         ? RuntimeHelper::H_fptoui_f32_i64
                         : RuntimeHelper::H_fptoui_f64_i64;
        }
      } else {
        return;
      }
      break;
    case InstCast::Sitofp:
      if (!Traits::Is64Bit && SrcType == IceType_i64) {
        HelperID = isFloat32Asserting32Or64(DestTy)
                       ? RuntimeHelper::H_sitofp_i64_f32
                       : RuntimeHelper::H_sitofp_i64_f64;
      } else {
        return;
      }
      break;
    case InstCast::Uitofp:
      if (isVectorType(SrcType)) {
        assert(DestTy == IceType_v4f32);
        assert(SrcType == IceType_v4i32);
        HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
      } else if (SrcType == IceType_i64 ||
                 (!Traits::Is64Bit && SrcType == IceType_i32)) {
        if (isInt32Asserting32Or64(SrcType)) {
          HelperID = isFloat32Asserting32Or64(DestTy)
                         ? RuntimeHelper::H_uitofp_i32_f32
                         : RuntimeHelper::H_uitofp_i32_f64;
        } else {
          HelperID = isFloat32Asserting32Or64(DestTy)
                         ? RuntimeHelper::H_uitofp_i64_f32
                         : RuntimeHelper::H_uitofp_i64_f64;
        }
      } else {
        return;
      }
      break;
    case InstCast::Bitcast: {
      if (DestTy == Src0->getType())
        return;
      switch (DestTy) {
      default:
        return;
      case IceType_i8:
        assert(Src0->getType() == IceType_v8i1);
        HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
        CallDest = Func->makeVariable(IceType_i32);
        break;
      case IceType_i16:
        assert(Src0->getType() == IceType_v16i1);
        HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
        CallDest = Func->makeVariable(IceType_i32);
        break;
      case IceType_v8i1: {
        assert(Src0->getType() == IceType_i8);
        HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
        // Arguments to functions are required to be at least 32 bits wide.
        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
        Src0 = Src0AsI32;
      } break;
      case IceType_v16i1: {
        assert(Src0->getType() == IceType_i16);
        HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
        Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
        // Arguments to functions are required to be at least 32 bits wide.
        Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
        Src0 = Src0AsI32;
      } break;
      }
    } break;
    }
    constexpr SizeT MaxSrcs = 1;
    InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
    Call->addArg(Src0);
    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
    Context.insert(Call);
    // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
    // result to the appropriate type as necessary.
    if (CallDest->getType() != Dest->getType())
      Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
    Cast->setDeleted();
  } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsicCall>(Instr)) {
    CfgVector<Type> ArgTypes;
    Type ReturnType = IceType_void;
    switch (Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicInfo().ID) {
    default:
      return;
    case Intrinsics::Ctpop: {
      Operand *Val = Intrinsic->getArg(0);
      Type ValTy = Val->getType();
      if (ValTy == IceType_i64)
        ArgTypes = {IceType_i64};
      else
        ArgTypes = {IceType_i32};
      ReturnType = IceType_i32;
    } break;
    case Intrinsics::Longjmp:
      ArgTypes = {IceType_i32, IceType_i32};
      ReturnType = IceType_void;
      break;
    case Intrinsics::Memcpy:
      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
      ReturnType = IceType_void;
      break;
    case Intrinsics::Memmove:
      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
      ReturnType = IceType_void;
      break;
    case Intrinsics::Memset:
      ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
      ReturnType = IceType_void;
      break;
    case Intrinsics::NaClReadTP:
      ReturnType = IceType_i32;
      break;
    case Intrinsics::Setjmp:
      ArgTypes = {IceType_i32};
      ReturnType = IceType_i32;
      break;
    }
    StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
  } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
    StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
  } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
    if (!Ret->hasRetValue())
      return;
    Operand *RetValue = Ret->getRetValue();
    Type ReturnType = RetValue->getType();
    if (!isScalarFloatingType(ReturnType))
      return;
    StackArgumentsSize = typeWidthInBytes(ReturnType);
  } else {
    return;
  }
  StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize);
  updateMaxOutArgsSizeBytes(StackArgumentsSize);
}

template <typename TraitsType>
uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
    const CfgVector<Type> &ArgTypes, Type ReturnType) {
  uint32_t OutArgumentsSizeBytes = 0;
  uint32_t XmmArgCount = 0;
  uint32_t GprArgCount = 0;
  for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
    Type Ty = ArgTypes[i];
    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
    assert(typeWidthInBytes(Ty) >= 4);
    if (isVectorType(Ty) &&
        Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgCount))
            .hasValue()) {
      ++XmmArgCount;
    } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
               Traits::getRegisterForXmmArgNum(
                   Traits::getArgIndex(i, XmmArgCount))
                   .hasValue()) {
      ++XmmArgCount;
    } else if (isScalarIntegerType(Ty) &&
               Traits::getRegisterForGprArgNum(
                   Ty, Traits::getArgIndex(i, GprArgCount))
                   .hasValue()) {
      // The 64 bit ABI allows some integers to be passed in GPRs.
      ++GprArgCount;
    } else {
      if (isVectorType(Ty)) {
        OutArgumentsSizeBytes =
            Traits::applyStackAlignment(OutArgumentsSizeBytes);
      }
      OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
    }
  }
  if (Traits::Is64Bit)
    return OutArgumentsSizeBytes;
  // The 32 bit ABI requires floating point values to be returned on the x87
  // FP stack. Ensure there is enough space for the fstp/movs for floating
  // returns.
  if (isScalarFloatingType(ReturnType)) {
    OutArgumentsSizeBytes =
        std::max(OutArgumentsSizeBytes,
                 static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
  }
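  // E.g. a call whose arguments all land in registers still reserves 8 bytes
  // here when it returns an f64 on x86-32, leaving room for the fstp that
  // moves the x87 return value into memory.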
  return OutArgumentsSizeBytes;
}

template <typename TraitsType>
uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
    const InstCall *Instr) {
  // Build a vector of the arguments' types.
  const SizeT NumArgs = Instr->getNumArgs();
  CfgVector<Type> ArgTypes;
  ArgTypes.reserve(NumArgs);
  for (SizeT i = 0; i < NumArgs; ++i) {
    Operand *Arg = Instr->getArg(i);
    ArgTypes.emplace_back(Arg->getType());
  }
  // Compute the return type (if any).
  Type ReturnType = IceType_void;
  Variable *Dest = Instr->getDest();
  if (Dest != nullptr)
    ReturnType = Dest->getType();
  return getShadowStoreSize<Traits>() +
         getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeZeroedRegister(Type Ty,
                                                        RegNumT RegNum) {
  Variable *Reg = makeReg(Ty, RegNum);
  switch (Ty) {
  case IceType_i1:
  case IceType_i8:
  case IceType_i16:
  case IceType_i32:
  case IceType_i64:
    // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
    _mov(Reg, Ctx->getConstantZero(Ty));
    break;
  case IceType_f32:
  case IceType_f64:
    Context.insert<InstFakeDef>(Reg);
    _xorps(Reg, Reg);
    break;
  default:
    // All vector types use the same pxor instruction.
    assert(isVectorType(Ty));
    Context.insert<InstFakeDef>(Reg);
    _pxor(Reg, Reg);
    break;
  }
  return Reg;
}

// There is no support for loading or emitting vector constants, so the vector
// values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
// initialized with register operations.
//
// TODO(wala): Add limited support for vector constants so that complex
// initialization in registers is unnecessary.

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfZeros(Type Ty,
                                                       RegNumT RegNum) {
  return makeZeroedRegister(Ty, RegNum);
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty,
                                                           RegNumT RegNum) {
  Variable *MinusOnes = makeReg(Ty, RegNum);
  // Insert a FakeDef so the live range of MinusOnes is not overestimated.
  Context.insert<InstFakeDef>(MinusOnes);
  if (Ty == IceType_f64)
    // Making a vector of minus ones of type f64 is currently only used for the
    // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
    // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
    // same job and only requires SSE2.
    _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
  else
    _pcmpeq(MinusOnes, MinusOnes);
  return MinusOnes;
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
  Variable *Dest = makeVectorOfZeros(Ty, RegNum);
  Variable *MinusOne = makeVectorOfMinusOnes(Ty);
  _psub(Dest, MinusOne);
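  // Each lane thus computes 0 - (-1) == 1; e.g. for v4i32 this is roughly
  // (illustrative sketch):
  //   pxor    %dest, %dest
  //   pcmpeqd %m, %m       ; every lane becomes -1
  //   psubd   %m, %dest    ; dest -= m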
  return Dest;
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfHighOrderBits(Type Ty,
                                                               RegNumT RegNum) {
  assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
         Ty == IceType_v16i8);
  if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
    Variable *Reg = makeVectorOfOnes(Ty, RegNum);
    SizeT Shift =
        typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
    _psll(Reg, Ctx->getConstantInt8(Shift));
    return Reg;
  } else {
    // SSE has no left shift operation for vectors of 8 bit integers.
    constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
    Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
    Variable *Reg = makeReg(Ty, RegNum);
    _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
    _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
    return Reg;
  }
}

/// Construct a mask in a register that can be and'ed with a floating-point
/// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
/// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as a vector
/// of ones logically right shifted one bit.
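/// E.g. for v4f32 the mask is built roughly as (illustrative sketch):
///   pcmpeqd %reg, %reg   ; all ones
///   psrld   $1, %reg     ; <4 x 0x7fffffff>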
// TODO(stichnot): Fix the wala TODO above, to represent vector constants in
// memory.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeVectorOfFabsMask(Type Ty,
                                                          RegNumT RegNum) {
  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
  _psrl(Reg, Ctx->getConstantInt8(1));
  return Reg;
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
                                                        uint32_t Offset) {
  // Ensure that Slot is a stack slot.
  assert(Slot->mustNotHaveReg());
  assert(Slot->getRegNum().hasNoValue());
  // Compute the location of Slot in memory.
  // TODO(wala,stichnot): lea should not be required. The address of the stack
  // slot is known at compile time (although not until after addProlog()).
  const Type PointerType = getPointerType();
  Variable *Loc = makeReg(PointerType);
  _lea(Loc, Slot);
  Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
  return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
}

/// Lowering helper to copy a scalar integer source operand into some 8-bit
/// GPR. Src is assumed to already be legalized.  If the source operand is
/// known to be a memory or immediate operand, a simple mov will suffice.  But
/// if the source operand can be a physical register, then it must first be
/// copied into a physical register that is truncable to 8-bit, then truncated
/// into a physical register that can receive a truncation, and finally copied
/// into the result 8-bit register (which in general can be any 8-bit
/// register).  For example, moving %ebp into %ah may be accomplished as:
///   movl %ebp, %edx
///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
///   movb %dl, %ah
/// On the other hand, moving a memory or immediate operand into ah:
///   movb 4(%ebp), %ah
///   movb $my_imm, %ah
///
/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
/// use RegNum=RegNumT() and then let the caller do a separate copy into
/// Reg_ah.
///
/// Note #2.  ConstantRelocatable operands are also put through this process
/// (not truncated directly) because our ELF emitter does R_386_32 relocations
/// but not R_386_8 relocations.
///
/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
/// to the pinsrb instruction.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::copyToReg8(Operand *Src, RegNumT RegNum) {
  Type Ty = Src->getType();
  assert(isScalarIntegerType(Ty));
  assert(Ty != IceType_i1);
  Variable *Reg = makeReg(IceType_i8, RegNum);
  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
    Variable *SrcTruncable = makeReg(Ty);
    switch (Ty) {
    case IceType_i64:
      SrcTruncable->setRegClass(RCX86_Is64To8);
      break;
    case IceType_i32:
      SrcTruncable->setRegClass(RCX86_Is32To8);
      break;
    case IceType_i16:
      SrcTruncable->setRegClass(RCX86_Is16To8);
      break;
    default:
      // i8 - just use the default register class.
      break;
    }
    Variable *SrcRcvr = makeReg(IceType_i8);
    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
    _mov(SrcTruncable, Src);
    _mov(SrcRcvr, SrcTruncable);
    Src = SrcRcvr;
  }
  _mov(Reg, Src);
  return Reg;
}

/// Helper for legalize() to emit the right code to lower an operand to a
/// register of the appropriate type.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::copyToReg(Operand *Src, RegNumT RegNum) {
  Type Ty = Src->getType();
  Variable *Reg = makeReg(Ty, RegNum);
  if (isVectorType(Ty)) {
    _movp(Reg, Src);
  } else {
    _mov(Reg, Src);
  }
  return Reg;
}

template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed,
                                             RegNumT RegNum) {
  const bool UseNonsfi = getFlags().getUseNonsfi();
  const Type Ty = From->getType();
  // Assert that a physical register is allowed. To date, all calls to
  // legalize() allow a physical register. If a physical register needs to be
  // explicitly disallowed, then new code will need to be written to force a
  // spill.
  assert(Allowed & Legal_Reg);
  // If we're asking for a specific physical register, make sure we're not
  // allowing any other operand kinds. (This could be future work, e.g. allow
  // the shl shift amount to be either an immediate or in ecx.)
  assert(RegNum.hasNoValue() || Allowed == Legal_Reg);

  // Substitute with an available infinite-weight variable if possible.  Only
  // do this when we are not asking for a specific register, and when the
  // substitution is not locked to a specific register, and when the types
  // match, in order to capture the vast majority of opportunities and avoid
  // corner cases in the lowering.
  if (RegNum.hasNoValue()) {
    if (Variable *Subst = getContext().availabilityGet(From)) {
      // At this point we know there is a potential substitution available.
      if (Subst->mustHaveReg() && !Subst->hasReg()) {
        // At this point we know the substitution will have a register.
        if (From->getType() == Subst->getType()) {
          // At this point we know the substitution's register is compatible.
          return Subst;
        }
      }
    }
  }

  if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
    // Before doing anything with a Mem operand, we need to ensure that the
    // Base and Index components are in physical registers.
    Variable *Base = Mem->getBase();
    Variable *Index = Mem->getIndex();
    Constant *Offset = Mem->getOffset();
    Variable *RegBase = nullptr;
    Variable *RegIndex = nullptr;
    uint16_t Shift = Mem->getShift();
    if (Base) {
      RegBase = llvm::cast<Variable>(
          legalize(Base, Legal_Reg | Legal_Rematerializable));
    }
    if (Index) {
      // TODO(jpp): perhaps we should only allow Legal_Reg if
      // Base->isRematerializable.
      RegIndex = llvm::cast<Variable>(
          legalize(Index, Legal_Reg | Legal_Rematerializable));
    }

    if (Base != RegBase || Index != RegIndex) {
      Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
                                  Mem->getSegmentRegister());
    }

    // For all memory operands, we do randomization/pooling here.
    From = randomizeOrPoolImmediate(Mem);

    if (!(Allowed & Legal_Mem)) {
      From = copyToReg(From, RegNum);
    }
    return From;
  }

  if (auto *Const = llvm::dyn_cast<Constant>(From)) {
    if (llvm::isa<ConstantUndef>(Const)) {
      From = legalizeUndef(Const, RegNum);
      if (isVectorType(Ty))
        return From;
      Const = llvm::cast<Constant>(From);
    }
    // There should be no constants of vector type (other than undef).
    assert(!isVectorType(Ty));

    // If the operand is a 64 bit constant integer we need to legalize it to a
    // register in x86-64.
    if (Traits::Is64Bit) {
      if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
        if (!Utils::IsInt(32, C64->getValue())) {
          if (RegNum.hasValue()) {
            assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum);
          }
          return copyToReg(Const, RegNum);
        }
      }
    }

    // If the operand is a 32 bit constant integer, we should check whether we
    // need to randomize it or pool it.
    if (auto *C = llvm::dyn_cast<ConstantInteger32>(Const)) {
      Operand *NewConst = randomizeOrPoolImmediate(C, RegNum);
      if (NewConst != Const) {
        return NewConst;
      }
    }

    if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Const)) {
      // If the operand is a ConstantRelocatable, and Legal_AddrAbs is not
      // specified, and UseNonsfi is indicated, we need to add RebasePtr.
      if (UseNonsfi && !(Allowed & Legal_AddrAbs)) {
        assert(Ty == IceType_i32);
        Variable *NewVar = makeReg(Ty, RegNum);
        auto *Mem = Traits::X86OperandMem::create(Func, Ty, nullptr, CR);
        // LEAs are not automatically sandboxed, thus we explicitly invoke
        // _sandbox_mem_reference.
        _lea(NewVar, _sandbox_mem_reference(Mem));
        From = NewVar;
      }
    } else if (isScalarFloatingType(Ty)) {
      // Convert a scalar floating point constant into an explicit memory
      // operand.
      if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
        if (Utils::isPositiveZero(ConstFloat->getValue()))
          return makeZeroedRegister(Ty, RegNum);
      } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
        if (Utils::isPositiveZero(ConstDouble->getValue()))
          return makeZeroedRegister(Ty, RegNum);
      }

      auto *CFrom = llvm::cast<Constant>(From);
      assert(CFrom->getShouldBePooled());
      Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
      auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
      From = Mem;
    }

    bool NeedsReg = false;
    if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
      // Immediate specifically not allowed.
      NeedsReg = true;
    if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
      // On x86, FP constants are lowered to mem operands.
      NeedsReg = true;
    if (NeedsReg) {
      From = copyToReg(From, RegNum);
    }
    return From;
  }

  if (auto *Var = llvm::dyn_cast<Variable>(From)) {
    // Check if the variable is guaranteed a physical register. This can happen
    // either when the variable is pre-colored or when it is assigned infinite
    // weight.
    bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
    bool MustRematerialize =
        (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
    // We need a new physical register for the operand if:
    // - Mem is not allowed and Var isn't guaranteed a physical register, or
    // - RegNum is required and Var->getRegNum() doesn't match, or
    // - Var is a rematerializable variable and rematerializable pass-through
    //   is not allowed (in which case we need a lea instruction).
    if (MustRematerialize) {
      Variable *NewVar = makeReg(Ty, RegNum);
      // Since Var is rematerializable, the offset will be added when the lea
      // is emitted.
      constexpr Constant *NoOffset = nullptr;
      auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
      _lea(NewVar, Mem);
      From = NewVar;
    } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
               (RegNum.hasValue() && RegNum != Var->getRegNum())) {
      From = copyToReg(From, RegNum);
    }
    return From;
  }

  llvm::report_fatal_error("Unhandled operand kind in legalize()");
  return From;
}

/// Provide a trivial wrapper to legalize() for this common usage.
template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::legalizeToReg(Operand *From,
                                                   RegNumT RegNum) {
  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
}

/// Legalize undef values to concrete values.
template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalizeUndef(Operand *From,
                                                  RegNumT RegNum) {
  Type Ty = From->getType();
  if (llvm::isa<ConstantUndef>(From)) {
    // Lower undefs to zero.  Another option is to lower undefs to an
    // uninitialized register; however, using an uninitialized register results
    // in less predictable code.
    //
    // If in the future the implementation is changed to lower undef values to
    // uninitialized registers, a FakeDef will be needed:
    //     Context.insert<InstFakeDef>(Reg);
    // This is in order to ensure that the live range of Reg is not
    // overestimated.  If the constant being lowered is a 64 bit value, then
    // the result should be split and the lo and hi components will need to go
    // in uninitialized registers.
    if (isVectorType(Ty))
      return makeVectorOfZeros(Ty, RegNum);
    return Ctx->getConstantZero(Ty);
  }
  return From;
}

/// For the cmp instruction, if Src1 is an immediate, or known to be a physical
/// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
/// copied into a physical register. (Actually, either Src0 or Src1 can be
/// chosen for the physical register, but unfortunately we have to commit to
/// one or the other before register allocation.)
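/// E.g. "icmp eq i32 %a, 4" can leave %a in memory ("cmpl $4, 8(%esp)"), but
/// comparing two stack slots must first load one of them into a register,
/// since an x86 cmp allows at most one memory operand (illustrative sketch).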
template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalizeSrc0ForCmp(Operand *Src0,
                                                       Operand *Src1) {
  bool IsSrc1ImmOrReg = false;
  if (llvm::isa<Constant>(Src1)) {
    IsSrc1ImmOrReg = true;
  } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
    if (Var->hasReg())
      IsSrc1ImmOrReg = true;
  }
  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::formMemoryOperand(Operand *Opnd, Type Ty,
                                             bool DoLegalize) {
  auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
  // It may be the case that address mode optimization already creates an
  // X86OperandMem, so in that case it wouldn't need another level of
  // transformation.
  if (!Mem) {
    auto *Base = llvm::dyn_cast<Variable>(Opnd);
    auto *Offset = llvm::dyn_cast<Constant>(Opnd);
    assert(Base || Offset);
    if (Offset) {
      // During memory operand building, we do not blind or pool the constant
      // offset; we will work on the whole memory operand as one entity later,
      // which saves one instruction. By turning blinding and pooling off, we
      // guarantee that legalize(Offset) will return a Constant*.
      if (!llvm::isa<ConstantRelocatable>(Offset)) {
        BoolFlagSaver B(RandomizationPoolingPaused, true);

        Offset = llvm::cast<Constant>(legalize(Offset));
      }

      assert(llvm::isa<ConstantInteger32>(Offset) ||
             llvm::isa<ConstantRelocatable>(Offset));
    }
    // Not completely sure whether it's OK to leave IsRebased unset when
    // creating the mem operand.  If DoLegalize is true, it will definitely be
    // applied during the legalize() call, but perhaps not during the
    // randomizeOrPoolImmediate() call.  In any case, the emit routines will
    // assert that PIC legalization has been applied.
    Mem = X86OperandMem::create(Func, Ty, Base, Offset);
  }
  // Do legalization (which includes randomization/pooling), or else just do
  // the randomization/pooling.
  return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem)
                                              : randomizeOrPoolImmediate(Mem));
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeReg(Type Type, RegNumT RegNum) {
  // There aren't any 64-bit integer registers for x86-32.
  assert(Traits::Is64Bit || Type != IceType_i64);
  Variable *Reg = Func->makeVariable(Type);
  if (RegNum.hasValue())
    Reg->setRegNum(RegNum);
  else
    Reg->setMustHaveReg();
  return Reg;
}

const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
                            IceType_v16i8};

template <typename TraitsType>
Type TargetX86Base<TraitsType>::largestTypeInSize(uint32_t Size,
                                                  uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}

template <typename TraitsType>
Type TargetX86Base<TraitsType>::firstTypeThatFitsSize(uint32_t Size,
                                                      uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  if (!llvm::isPowerOf2_32(Size))
    ++TyIndex;
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}
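
// E.g. for Size == 13, findLastSet gives index 3, so largestTypeInSize picks
// TypeForSize[3] (f64, 8 bytes <= 13), while firstTypeThatFitsSize bumps the
// index because 13 is not a power of two and picks TypeForSize[4] (v16i8,
// 16 bytes >= 13).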
8213 
8214 template <typename TraitsType> void TargetX86Base<TraitsType>::postLower() {
8215   if (Func->getOptLevel() == Opt_m1)
8216     return;
8217   markRedefinitions();
8218   Context.availabilityUpdate();
8219 }
8220 
8221 template <typename TraitsType>
8222 void TargetX86Base<TraitsType>::makeRandomRegisterPermutation(
8223     llvm::SmallVectorImpl<RegNumT> &Permutation,
8224     const SmallBitVector &ExcludeRegisters, uint64_t Salt) const {
8225   Traits::makeRandomRegisterPermutation(Func, Permutation, ExcludeRegisters,
8226                                         Salt);
8227 }
8228 
8229 template <typename TraitsType>
8230 void TargetX86Base<TraitsType>::emit(const ConstantInteger32 *C) const {
8231   if (!BuildDefs::dump())
8232     return;
8233   Ostream &Str = Ctx->getStrEmit();
8234   Str << "$" << C->getValue();
8235 }
8236 
8237 template <typename TraitsType>
8238 void TargetX86Base<TraitsType>::emit(const ConstantInteger64 *C) const {
8239   if (!Traits::Is64Bit) {
8240     llvm::report_fatal_error("Not expecting to emit 64-bit integers");
8241   } else {
8242     if (!BuildDefs::dump())
8243       return;
8244     Ostream &Str = Ctx->getStrEmit();
8245     Str << "$" << C->getValue();
8246   }
8247 }
8248 
8249 template <typename TraitsType>
8250 void TargetX86Base<TraitsType>::emit(const ConstantFloat *C) const {
8251   if (!BuildDefs::dump())
8252     return;
8253   Ostream &Str = Ctx->getStrEmit();
8254   Str << C->getLabelName();
8255 }
8256 
8257 template <typename TraitsType>
8258 void TargetX86Base<TraitsType>::emit(const ConstantDouble *C) const {
8259   if (!BuildDefs::dump())
8260     return;
8261   Ostream &Str = Ctx->getStrEmit();
8262   Str << C->getLabelName();
8263 }
8264 
8265 template <typename TraitsType>
8266 void TargetX86Base<TraitsType>::emit(const ConstantUndef *) const {
8267   llvm::report_fatal_error("undef value encountered by emitter.");
8268 }
8269 
8270 template <class Machine>
8271 void TargetX86Base<Machine>::emit(const ConstantRelocatable *C) const {
8272   if (!BuildDefs::dump())
8273     return;
8274   assert(!getFlags().getUseNonsfi() ||
8275          C->getName().toString() == GlobalOffsetTable);
8276   Ostream &Str = Ctx->getStrEmit();
8277   Str << "$";
8278   emitWithoutPrefix(C);
8279 }

/// Randomize or pool an Immediate.
template <typename TraitsType>
Operand *
TargetX86Base<TraitsType>::randomizeOrPoolImmediate(Constant *Immediate,
                                                    RegNumT RegNum) {
  assert(llvm::isa<ConstantInteger32>(Immediate) ||
         llvm::isa<ConstantRelocatable>(Immediate));
  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
      RandomizationPoolingPaused) {
    // Immediate randomization/pooling is off or paused.
    return Immediate;
  }

  if (Traits::Is64Bit && NeedSandboxing) {
    // Immediate randomization/pooling is currently disabled for x86-64
    // sandboxing because it could generate invalid memory operands.
    assert(false &&
           "Constant pooling/randomization is disabled for x8664 sandbox.");
    return Immediate;
  }

  if (!Immediate->shouldBeRandomizedOrPooled()) {
    // The constant Immediate is not eligible for blinding/pooling.
    return Immediate;
  }
  Ctx->statsUpdateRPImms();
  switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
  default:
    llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
  case RPI_Randomize: {
    // Blind the constant.
    // FROM:
    //  imm
    // TO:
    //  insert: mov imm+cookie, Reg
    //  insert: lea -cookie[Reg], Reg
    //  => Reg
    // If we have already been assigned a physical register, we must have come
    // from advancedPhiLowering()=>lowerAssign(). In this case we should reuse
    // the assigned register, as this assignment is the start of its use-def
    // chain. So we add the RegNum argument here. Note that we use an 'lea'
    // instruction instead of 'xor' to avoid affecting the flags.
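    // Illustrative example (the cookie value is hypothetical): with
    // Cookie == 0x12345678 and imm == 0x7, the emitted sequence is roughly
    //   mov $0x1234567f, Reg
    //   lea -0x12345678(Reg), Reg
    // leaving Reg == 0x7 without clobbering the flags.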
    Variable *Reg = makeReg(IceType_i32, RegNum);
    auto *Integer = llvm::cast<ConstantInteger32>(Immediate);
    uint32_t Value = Integer->getValue();
    uint32_t Cookie = Func->getConstantBlindingCookie();
    _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value));
    Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie);
    _lea(Reg, X86OperandMem::create(Func, IceType_i32, Reg, Offset));
    if (Immediate->getType() == IceType_i32) {
      return Reg;
    }
    Variable *TruncReg = makeReg(Immediate->getType(), RegNum);
    _mov(TruncReg, Reg);
    return TruncReg;
  }
  case RPI_Pool: {
    // Pool the constant.
    // FROM:
    //  imm
    // TO:
    //  insert: mov $label, Reg
    //  => Reg
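    // Illustrative example (the label name is hypothetical): a poolable
    // constant such as 0x12345678 becomes a load from its pool entry,
    // roughly
    //   mov .L$i32$0, Reg
    // where .L$i32$0 is the read-only pool slot holding 0x12345678.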
    assert(getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool);
    assert(Immediate->getShouldBePooled());
    // If we have already been assigned a physical register, we must have come
    // from advancedPhiLowering()=>lowerAssign(). In this case we should reuse
    // the assigned register, as this assignment is the start of its use-def
    // chain. So we add the RegNum argument here.
    Variable *Reg = makeReg(Immediate->getType(), RegNum);
    constexpr RelocOffsetT Offset = 0;
    Constant *Symbol = Ctx->getConstantSym(Offset, Immediate->getLabelName());
    constexpr Variable *NoBase = nullptr;
    X86OperandMem *MemOperand =
        X86OperandMem::create(Func, Immediate->getType(), NoBase, Symbol);
    _mov(Reg, MemOperand);
    return Reg;
  }
  }
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::randomizeOrPoolImmediate(X86OperandMem *MemOperand,
                                                    RegNumT RegNum) {
  assert(MemOperand);
  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
      RandomizationPoolingPaused) {
    // Immediate randomization/pooling is off or paused.
    return MemOperand;
  }

  if (Traits::Is64Bit && NeedSandboxing) {
    // Immediate randomization/pooling is currently disabled for x86-64
    // sandboxing because it could generate invalid memory operands.
    assert(false &&
           "Constant pooling/randomization is disabled for x8664 sandbox.");
    return MemOperand;
  }

  // If this memory operand has already been randomized, do not randomize it
  // again.
  if (MemOperand->getRandomized())
    return MemOperand;

  auto *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset());

  if (C == nullptr) {
    return MemOperand;
  }

  if (!C->shouldBeRandomizedOrPooled()) {
    return MemOperand;
  }

  // The offset of this mem operand should be blinded or pooled.
  Ctx->statsUpdateRPImms();
  switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
  default:
    llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
  case RPI_Randomize: {
    // Blind the constant offset.
    // FROM:
    //  offset[base, index, shift]
    // TO:
    //  insert: lea offset+cookie[base], RegTemp
    //  => -cookie[RegTemp, index, shift]
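    // Illustrative example (the cookie value is hypothetical): with
    // Cookie == 0x12345678, the operand 0x10(%ebx, %ecx, 4) becomes roughly
    //   lea 0x12345688(%ebx), RegTemp
    //   => -0x12345678(RegTemp, %ecx, 4)
    // which computes the same address without exposing the raw offset.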
    uint32_t Value =
        llvm::dyn_cast<ConstantInteger32>(MemOperand->getOffset())->getValue();
    uint32_t Cookie = Func->getConstantBlindingCookie();
    Constant *Mask1 =
        Ctx->getConstantInt(MemOperand->getOffset()->getType(), Cookie + Value);
    Constant *Mask2 =
        Ctx->getConstantInt(MemOperand->getOffset()->getType(), 0 - Cookie);

    X86OperandMem *TempMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), MemOperand->getBase(), Mask1);
    // If we have already been assigned a physical register, we must have come
    // from advancedPhiLowering()=>lowerAssign(). In this case we should reuse
    // the assigned register, as this assignment is the start of its use-def
    // chain. So we add the RegNum argument here.
    Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum);
    _lea(RegTemp, TempMemOperand);

    X86OperandMem *NewMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), RegTemp, Mask2, MemOperand->getIndex(),
        MemOperand->getShift(), MemOperand->getSegmentRegister(),
        MemOperand->getIsRebased());

    // Label this memory operand as randomized, so we won't randomize it
    // again in case legalize() is called on it multiple times.
    NewMemOperand->setRandomized(true);
    return NewMemOperand;
  }
  case RPI_Pool: {
    // Pool the constant offset.
    // FROM:
    //  offset[base, index, shift]
    // TO:
    //  insert: mov $label, RegTemp
    //  insert: lea [base, RegTemp], RegTemp
    //  => [RegTemp, index, shift]
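    // Illustrative example (the label name is hypothetical): the operand
    // 0x10(%ebx, %ecx, 4) with a pooled offset becomes roughly
    //   mov .L$i32$0, RegTemp
    //   lea (%ebx, RegTemp), RegTemp
    //   => (RegTemp, %ecx, 4)
    // where .L$i32$0 is the pool slot holding 0x10.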

    // Memory operands should never appear as source operands in phi lowering
    // assignments, so there is no need to reuse any registers here. For phi
    // lowering we should not ask for new physical registers in general.
    // However, if we do encounter a memory operand during phi lowering, we
    // should not blind or pool its immediates for now.
    if (RegNum.hasValue())
      return MemOperand;
    Variable *RegTemp = makeReg(IceType_i32);
    assert(MemOperand->getOffset()->getShouldBePooled());
    constexpr RelocOffsetT SymOffset = 0;
    Constant *Symbol =
        Ctx->getConstantSym(SymOffset, MemOperand->getOffset()->getLabelName());
    constexpr Variable *NoBase = nullptr;
    X86OperandMem *SymbolOperand = X86OperandMem::create(
        Func, MemOperand->getOffset()->getType(), NoBase, Symbol);
    _mov(RegTemp, SymbolOperand);
    // If the memory operand has a base variable, emit an lea to add its value
    // to RegTemp; without a base variable, the lea is unnecessary.
    if (MemOperand->getBase()) {
      X86OperandMem *CalculateOperand = X86OperandMem::create(
          Func, MemOperand->getType(), MemOperand->getBase(), nullptr, RegTemp,
          0, MemOperand->getSegmentRegister());
      _lea(RegTemp, CalculateOperand);
    }
    X86OperandMem *NewMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), RegTemp, nullptr, MemOperand->getIndex(),
        MemOperand->getShift(), MemOperand->getSegmentRegister());
    return NewMemOperand;
  }
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emitJumpTable(
    const Cfg *, const InstJumpTable *JumpTable) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  const bool UseNonsfi = getFlags().getUseNonsfi();
  const char *Prefix = UseNonsfi ? ".data.rel.ro." : ".rodata.";
  Str << "\t.section\t" << Prefix << JumpTable->getSectionName()
      << ",\"a\",@progbits\n"
         "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n"
      << JumpTable->getName() << ":";

  // On x86, ILP32 pointers are 32 bits wide, hence the use of .long.
  for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
    Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
  Str << "\n";
}
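
// Sketch of the emitted assembly for a three-target jump table (the section
// and label names are illustrative):
//   .section .rodata.foo$jumptable,"a",@progbits
//   .align 4
//   foo$jumptable$0:
//   .long bb1
//   .long bb2
//   .long bb3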

template <typename TraitsType>
template <typename T>
void TargetDataX86<TraitsType>::emitConstantPool(GlobalContext *Ctx) {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Type Ty = T::Ty;
  SizeT Align = typeAlignInBytes(Ty);
  ConstantList Pool = Ctx->getConstantPool(Ty);

  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
      << "\n";
  Str << "\t.align\t" << Align << "\n";

  // If the reorder-pooled-constants option is set, shuffle the constant pool
  // before emitting it.
  if (getFlags().getReorderPooledConstants() && !Pool.empty()) {
    // Use the constant's kind value as the salt for creating the random
    // number generator.
    Operand::OperandKind K = (*Pool.begin())->getKind();
    RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                              RPE_PooledConstantReordering, K);
    RandomShuffle(Pool.begin(), Pool.end(),
                  [&RNG](uint64_t N) { return (uint32_t)RNG.next(N); });
  }

  for (Constant *C : Pool) {
    if (!C->getShouldBePooled())
      continue;
    auto *Const = llvm::cast<typename T::IceType>(C);
    typename T::IceType::PrimType Value = Const->getValue();
    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
    // breaking strict-aliasing rules.
    typename T::PrimitiveIntType RawValue;
    memcpy(&RawValue, &Value, sizeof(Value));
    char buf[30];
    int CharsPrinted =
        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
    assert(CharsPrinted >= 0);
    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
    (void)CharsPrinted; // Avoid warnings when asserts are disabled.
    Str << Const->getLabelName();
    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
        << Value << " */\n";
  }
}
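
// Illustrative output for a pooled f32 constant 1.5 (the label name is
// hypothetical; the directive and format string come from the
// PoolTypeConverter<float> specialization):
//   .section .rodata.cst4,"aM",@progbits,4
//   .align 4
//   .L$f32$0:
//   .long 0x3fc00000 /* f32 1.5 */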

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerConstants() {
  if (getFlags().getDisableTranslation())
    return;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();

    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);

    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);

    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);

    emitConstantPool<PoolTypeConverter<float>>(Ctx);
    emitConstantPool<PoolTypeConverter<double>>(Ctx);
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerJumpTables() {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
    const FixupKind RelocationKind =
        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
    for (const JumpTableData &JT : Ctx->getJumpTables())
      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
  } break;
  case FT_Asm:
    // Already emitted from the Cfg.
    break;
  case FT_Iasm: {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
    for (const JumpTableData &JT : Ctx->getJumpTables()) {
      Str << "\t.section\t" << Prefix << JT.getSectionName()
          << ",\"a\",@progbits\n"
             "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n"
          << JT.getName().toString() << ":";

      // On x86-64 ILP32, pointers are 32 bits wide, hence the use of .long.
      for (intptr_t TargetOffset : JT.getTargetOffsets())
        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
      Str << "\n";
    }
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerGlobals(
    const VariableDeclarationList &Vars, const std::string &SectionSuffix) {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);
    for (const VariableDeclaration *Var : Vars) {
      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
        emitGlobal(*Var, SectionSuffix);
      }
    }
  } break;
  }
}
} // end of namespace X86NAMESPACE
} // end of namespace Ice

#endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H