1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// 2 // 3 // The Subzero Code Generator 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief Implements the TargetLoweringX86Base class, which consists almost 12 /// entirely of the lowering sequence for each high-level instruction. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H 17 #define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H 18 19 #include "IceCfg.h" 20 #include "IceCfgNode.h" 21 #include "IceClFlags.h" 22 #include "IceDefs.h" 23 #include "IceELFObjectWriter.h" 24 #include "IceGlobalInits.h" 25 #include "IceInstVarIter.h" 26 #include "IceInstX86Base.h" 27 #include "IceLiveness.h" 28 #include "IceOperand.h" 29 #include "IcePhiLoweringImpl.h" 30 #include "IceUtils.h" 31 #include "IceVariableSplitting.h" 32 33 #include "llvm/Support/MathExtras.h" 34 35 #include <stack> 36 37 namespace Ice { 38 namespace X86 { 39 template <typename T> struct PoolTypeConverter {}; 40 41 template <> struct PoolTypeConverter<float> { 42 using PrimitiveIntType = uint32_t; 43 using IceType = ConstantFloat; 44 static const Type Ty = IceType_f32; 45 static const char *TypeName; 46 static const char *AsmTag; 47 static const char *PrintfString; 48 }; 49 50 template <> struct PoolTypeConverter<double> { 51 using PrimitiveIntType = uint64_t; 52 using IceType = ConstantDouble; 53 static const Type Ty = IceType_f64; 54 static const char *TypeName; 55 static const char *AsmTag; 56 static const char *PrintfString; 57 }; 58 59 // Add converter for int type constant pooling 60 template <> struct PoolTypeConverter<uint32_t> { 61 using PrimitiveIntType = uint32_t; 62 using IceType = ConstantInteger32; 63 static const Type Ty = IceType_i32; 64 static const char *TypeName; 65 static const char *AsmTag; 66 static const char *PrintfString; 67 }; 68 69 // Add converter for int type constant pooling 70 template <> struct PoolTypeConverter<uint16_t> { 71 using PrimitiveIntType = uint32_t; 72 using IceType = ConstantInteger32; 73 static const Type Ty = IceType_i16; 74 static const char *TypeName; 75 static const char *AsmTag; 76 static const char *PrintfString; 77 }; 78 79 // Add converter for int type constant pooling 80 template <> struct PoolTypeConverter<uint8_t> { 81 using PrimitiveIntType = uint32_t; 82 using IceType = ConstantInteger32; 83 static const Type Ty = IceType_i8; 84 static const char *TypeName; 85 static const char *AsmTag; 86 static const char *PrintfString; 87 }; 88 } // end of namespace X86 89 90 namespace X86NAMESPACE { 91 92 // The Microsoft x64 ABI requires the caller to allocate a minimum 32 byte 93 // "shadow store" (aka "home space") so that the callee may copy the 4 94 // register args to it. 95 template <typename Traits> SizeT getShadowStoreSize() { 96 #if defined(SUBZERO_USE_MICROSOFT_ABI) 97 static const SizeT ShadowStoreSize = 98 Traits::Is64Bit ? 
4 * typeWidthInBytes(Traits::WordType) : 0; 99 return ShadowStoreSize; 100 #else 101 return 0; 102 #endif 103 } 104 105 using Utils::BoolFlagSaver; 106 107 template <typename Traits> class BoolFoldingEntry { 108 BoolFoldingEntry(const BoolFoldingEntry &) = delete; 109 110 public: 111 BoolFoldingEntry() = default; 112 explicit BoolFoldingEntry(Inst *I); 113 BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default; 114 /// Instr is the instruction producing the i1-type variable of interest. 115 Inst *Instr = nullptr; 116 /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr). 117 bool IsComplex = false; 118 /// IsLiveOut is initialized conservatively to true, and is set to false when 119 /// we encounter an instruction that ends Var's live range. We disable the 120 /// folding optimization when Var is live beyond this basic block. Note that 121 /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will 122 /// always be true and the folding optimization will never be performed. 123 bool IsLiveOut = true; 124 // NumUses counts the number of times Var is used as a source operand in the 125 // basic block. If IsComplex is true and there is more than one use of Var, 126 // then the folding optimization is disabled for Var. 127 uint32_t NumUses = 0; 128 }; 129 130 template <typename Traits> class BoolFolding { 131 public: 132 enum BoolFoldingProducerKind { 133 PK_None, 134 // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative. 135 PK_Icmp32, 136 PK_Icmp64, 137 PK_Fcmp, 138 PK_Trunc, 139 PK_Arith // A flag-setting arithmetic instruction. 140 }; 141 142 /// Currently the actual enum values are not used (other than CK_None), but we 143 /// go ahead and produce them anyway for symmetry with the 144 /// BoolFoldingProducerKind. 145 enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext }; 146 147 private: 148 BoolFolding(const BoolFolding &) = delete; 149 BoolFolding &operator=(const BoolFolding &) = delete; 150 151 public: 152 BoolFolding() = default; 153 static BoolFoldingProducerKind getProducerKind(const Inst *Instr); 154 static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr); 155 static bool hasComplexLowering(const Inst *Instr); 156 static bool isValidFolding(BoolFoldingProducerKind ProducerKind, 157 BoolFoldingConsumerKind ConsumerKind); 158 void init(CfgNode *Node); 159 const Inst *getProducerFor(const Operand *Opnd) const; 160 void dump(const Cfg *Func) const; 161 162 private: 163 /// Returns true if Producers contains a valid entry for the given VarNum. 164 bool containsValid(SizeT VarNum) const { 165 auto Element = Producers.find(VarNum); 166 return Element != Producers.end() && Element->second.Instr != nullptr; 167 } 168 void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; } 169 void invalidateProducersOnStore(const Inst *Instr); 170 /// Producers maps Variable::Number to a BoolFoldingEntry. 
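  /// Illustrative sketch (not verbatim IR): in a block containing
  ///   %flag = icmp eq i32 %a, %b
  ///   br i1 %flag, label %then, label %else
  /// Producers records the icmp under %flag's variable number, so lowering can
  /// fold the compare into the branch and emit a single cmp/jcc pair instead of
  /// materializing %flag.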
171 CfgUnorderedMap<SizeT, BoolFoldingEntry<Traits>> Producers; 172 }; 173 174 template <typename Traits> 175 BoolFoldingEntry<Traits>::BoolFoldingEntry(Inst *I) 176 : Instr(I), IsComplex(BoolFolding<Traits>::hasComplexLowering(I)) {} 177 178 template <typename Traits> 179 typename BoolFolding<Traits>::BoolFoldingProducerKind 180 BoolFolding<Traits>::getProducerKind(const Inst *Instr) { 181 if (llvm::isa<InstIcmp>(Instr)) { 182 if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64) 183 return PK_Icmp32; 184 return PK_Icmp64; 185 } 186 if (llvm::isa<InstFcmp>(Instr)) 187 return PK_Fcmp; 188 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) { 189 if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) { 190 switch (Arith->getOp()) { 191 default: 192 return PK_None; 193 case InstArithmetic::And: 194 case InstArithmetic::Or: 195 return PK_Arith; 196 } 197 } 198 } 199 return PK_None; // TODO(stichnot): remove this 200 201 if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) { 202 switch (Cast->getCastKind()) { 203 default: 204 return PK_None; 205 case InstCast::Trunc: 206 return PK_Trunc; 207 } 208 } 209 return PK_None; 210 } 211 212 template <typename Traits> 213 typename BoolFolding<Traits>::BoolFoldingConsumerKind 214 BoolFolding<Traits>::getConsumerKind(const Inst *Instr) { 215 if (llvm::isa<InstBr>(Instr)) 216 return CK_Br; 217 if (llvm::isa<InstSelect>(Instr)) 218 return CK_Select; 219 return CK_None; // TODO(stichnot): remove this 220 221 if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) { 222 switch (Cast->getCastKind()) { 223 default: 224 return CK_None; 225 case InstCast::Sext: 226 return CK_Sext; 227 case InstCast::Zext: 228 return CK_Zext; 229 } 230 } 231 return CK_None; 232 } 233 234 /// Returns true if the producing instruction has a "complex" lowering sequence. 235 /// This generally means that its lowering sequence requires more than one 236 /// conditional branch, namely 64-bit integer compares and some floating-point 237 /// compares. When this is true, and there is more than one consumer, we prefer 238 /// to disable the folding optimization because it minimizes branches. 239 template <typename Traits> 240 bool BoolFolding<Traits>::hasComplexLowering(const Inst *Instr) { 241 switch (getProducerKind(Instr)) { 242 default: 243 return false; 244 case PK_Icmp64: 245 return !Traits::Is64Bit; 246 case PK_Fcmp: 247 return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 != 248 Traits::Cond::Br_None; 249 } 250 } 251 252 template <typename Traits> 253 bool BoolFolding<Traits>::isValidFolding( 254 typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind, 255 typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind) { 256 switch (ProducerKind) { 257 default: 258 return false; 259 case PK_Icmp32: 260 case PK_Icmp64: 261 case PK_Fcmp: 262 return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select); 263 case PK_Arith: 264 return ConsumerKind == CK_Br; 265 } 266 } 267 268 template <typename Traits> void BoolFolding<Traits>::init(CfgNode *Node) { 269 Producers.clear(); 270 for (Inst &Instr : Node->getInsts()) { 271 if (Instr.isDeleted()) 272 continue; 273 invalidateProducersOnStore(&Instr); 274 // Check whether Instr is a valid producer. 
275 Variable *Var = Instr.getDest(); 276 if (Var) { // only consider instructions with an actual dest var 277 if (isBooleanType(Var->getType())) { // only bool-type dest vars 278 if (getProducerKind(&Instr) != PK_None) { // white-listed instructions 279 Producers[Var->getIndex()] = BoolFoldingEntry<Traits>(&Instr); 280 } 281 } 282 } 283 // Check each src variable against the map. 284 FOREACH_VAR_IN_INST(Var, Instr) { 285 SizeT VarNum = Var->getIndex(); 286 if (!containsValid(VarNum)) 287 continue; 288 // All valid consumers use Var as the first source operand 289 if (IndexOfVarOperandInInst(Var) != 0) { 290 setInvalid(VarNum); 291 continue; 292 } 293 // Consumer instructions must be white-listed 294 typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind = 295 getConsumerKind(&Instr); 296 if (ConsumerKind == CK_None) { 297 setInvalid(VarNum); 298 continue; 299 } 300 typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind = 301 getProducerKind(Producers[VarNum].Instr); 302 if (!isValidFolding(ProducerKind, ConsumerKind)) { 303 setInvalid(VarNum); 304 continue; 305 } 306 // Avoid creating multiple copies of complex producer instructions. 307 if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) { 308 setInvalid(VarNum); 309 continue; 310 } 311 ++Producers[VarNum].NumUses; 312 if (Instr.isLastUse(Var)) { 313 Producers[VarNum].IsLiveOut = false; 314 } 315 } 316 } 317 for (auto &I : Producers) { 318 // Ignore entries previously marked invalid. 319 if (I.second.Instr == nullptr) 320 continue; 321 // Disable the producer if its dest may be live beyond this block. 322 if (I.second.IsLiveOut) { 323 setInvalid(I.first); 324 continue; 325 } 326 // Mark as "dead" rather than outright deleting. This is so that other 327 // peephole style optimizations during or before lowering have access to 328 // this instruction in undeleted form. See for example 329 // tryOptimizedCmpxchgCmpBr(). 330 I.second.Instr->setDead(); 331 } 332 } 333 334 template <typename Traits> 335 const Inst *BoolFolding<Traits>::getProducerFor(const Operand *Opnd) const { 336 auto *Var = llvm::dyn_cast<const Variable>(Opnd); 337 if (Var == nullptr) 338 return nullptr; 339 SizeT VarNum = Var->getIndex(); 340 auto Element = Producers.find(VarNum); 341 if (Element == Producers.end()) 342 return nullptr; 343 return Element->second.Instr; 344 } 345 346 template <typename Traits> 347 void BoolFolding<Traits>::dump(const Cfg *Func) const { 348 if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding)) 349 return; 350 OstreamLocker L(Func->getContext()); 351 Ostream &Str = Func->getContext()->getStrDump(); 352 for (auto &I : Producers) { 353 if (I.second.Instr == nullptr) 354 continue; 355 Str << "Found foldable producer:\n "; 356 I.second.Instr->dump(Func); 357 Str << "\n"; 358 } 359 } 360 361 /// If the given instruction has potential memory side effects (e.g. store, rmw, 362 /// or a call instruction with potential memory side effects), then we must not 363 /// allow a pre-store Producer instruction with memory operands to be folded 364 /// into a post-store Consumer instruction. If this is detected, the Producer 365 /// is invalidated. 366 /// 367 /// We use the Producer's IsLiveOut field to determine whether any potential 368 /// Consumers come after this store instruction. The IsLiveOut field is 369 /// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it 370 /// sees the variable's definitive last use (indicating the variable is not in 371 /// the node's live-out set). 
Thus if we see here that IsLiveOut is false, we 372 /// know that there can be no consumers after the store, and therefore we know 373 /// the folding is safe despite the store instruction. 374 template <typename Traits> 375 void BoolFolding<Traits>::invalidateProducersOnStore(const Inst *Instr) { 376 if (!Instr->isMemoryWrite()) 377 return; 378 for (auto &ProducerPair : Producers) { 379 if (!ProducerPair.second.IsLiveOut) 380 continue; 381 Inst *PInst = ProducerPair.second.Instr; 382 if (PInst == nullptr) 383 continue; 384 bool HasMemOperand = false; 385 const SizeT SrcSize = PInst->getSrcSize(); 386 for (SizeT I = 0; I < SrcSize; ++I) { 387 if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) { 388 HasMemOperand = true; 389 break; 390 } 391 } 392 if (!HasMemOperand) 393 continue; 394 setInvalid(ProducerPair.first); 395 } 396 } 397 398 template <typename TraitsType> 399 void TargetX86Base<TraitsType>::initNodeForLowering(CfgNode *Node) { 400 FoldingInfo.init(Node); 401 FoldingInfo.dump(Func); 402 } 403 404 template <typename TraitsType> 405 TargetX86Base<TraitsType>::TargetX86Base(Cfg *Func) 406 : TargetLowering(Func), NeedSandboxing(SandboxingType == ST_NaCl) { 407 static_assert( 408 (Traits::InstructionSet::End - Traits::InstructionSet::Begin) == 409 (TargetInstructionSet::X86InstructionSet_End - 410 TargetInstructionSet::X86InstructionSet_Begin), 411 "Traits::InstructionSet range different from TargetInstructionSet"); 412 if (getFlags().getTargetInstructionSet() != 413 TargetInstructionSet::BaseInstructionSet) { 414 InstructionSet = static_cast<InstructionSetEnum>( 415 (getFlags().getTargetInstructionSet() - 416 TargetInstructionSet::X86InstructionSet_Begin) + 417 Traits::InstructionSet::Begin); 418 } 419 } 420 421 template <typename TraitsType> 422 void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) { 423 RegNumT::setLimit(Traits::RegisterSet::Reg_NUM); 424 Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases); 425 for (size_t i = 0; i < TypeToRegisterSet.size(); ++i) 426 TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i]; 427 filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM, 428 TypeToRegisterSet.data(), TypeToRegisterSet.size(), 429 Traits::getRegName, getRegClassName); 430 PcRelFixup = Traits::FK_PcRel; 431 AbsFixup = getFlags().getUseNonsfi() ? Traits::FK_Gotoff : Traits::FK_Abs; 432 } 433 434 template <typename TraitsType> 435 bool TargetX86Base<TraitsType>::shouldBePooled(const Constant *C) { 436 if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) { 437 return !Utils::isPositiveZero(ConstFloat->getValue()); 438 } 439 if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) { 440 return !Utils::isPositiveZero(ConstDouble->getValue()); 441 } 442 if (getFlags().getRandomizeAndPoolImmediatesOption() != RPI_Pool) { 443 return false; 444 } 445 return C->shouldBeRandomizedOrPooled(); 446 } 447 448 template <typename TraitsType> 449 ::Ice::Type TargetX86Base<TraitsType>::getPointerType() { 450 if (!Traits::Is64Bit || 451 ::Ice::getFlags().getApplicationBinaryInterface() == ::Ice::ABI_PNaCl) { 452 return ::Ice::IceType_i32; 453 } 454 return ::Ice::IceType_i64; 455 } 456 457 template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() { 458 TimerMarker T(TimerStack::TT_O2, Func); 459 460 if (SandboxingType != ST_None) { 461 initRebasePtr(); 462 } 463 464 genTargetHelperCalls(); 465 Func->dump("After target helper call insertion"); 466 467 // Merge Alloca instructions, and lay out the stack. 
468 static constexpr bool SortAndCombineAllocas = true; 469 Func->processAllocas(SortAndCombineAllocas); 470 Func->dump("After Alloca processing"); 471 472 // Run this early so it can be used to focus optimizations on potentially hot 473 // code. 474 // TODO(stichnot,ascull): currently only used for regalloc not 475 // expensive high level optimizations which could be focused on potentially 476 // hot code. 477 Func->generateLoopInfo(); 478 Func->dump("After loop analysis"); 479 if (getFlags().getLoopInvariantCodeMotion()) { 480 Func->loopInvariantCodeMotion(); 481 Func->dump("After LICM"); 482 } 483 484 if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) { 485 Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA); 486 Func->dump("After Local CSE"); 487 Func->floatConstantCSE(); 488 } 489 if (getFlags().getEnableShortCircuit()) { 490 Func->shortCircuitJumps(); 491 Func->dump("After Short Circuiting"); 492 } 493 494 if (!getFlags().getEnablePhiEdgeSplit()) { 495 // Lower Phi instructions. 496 Func->placePhiLoads(); 497 if (Func->hasError()) 498 return; 499 Func->placePhiStores(); 500 if (Func->hasError()) 501 return; 502 Func->deletePhis(); 503 if (Func->hasError()) 504 return; 505 Func->dump("After Phi lowering"); 506 } 507 508 // Address mode optimization. 509 Func->getVMetadata()->init(VMK_SingleDefs); 510 Func->doAddressOpt(); 511 Func->materializeVectorShuffles(); 512 513 // Find read-modify-write opportunities. Do this after address mode 514 // optimization so that doAddressOpt() doesn't need to be applied to RMW 515 // instructions as well. 516 findRMW(); 517 Func->dump("After RMW transform"); 518 519 // Argument lowering 520 Func->doArgLowering(); 521 522 // Target lowering. This requires liveness analysis for some parts of the 523 // lowering decisions, such as compare/branch fusing. If non-lightweight 524 // liveness analysis is used, the instructions need to be renumbered first 525 // TODO: This renumbering should only be necessary if we're actually 526 // calculating live intervals, which we only do for register allocation. 527 Func->renumberInstructions(); 528 if (Func->hasError()) 529 return; 530 531 // TODO: It should be sufficient to use the fastest liveness calculation, 532 // i.e. livenessLightweight(). However, for some reason that slows down the 533 // rest of the translation. Investigate. 534 Func->liveness(Liveness_Basic); 535 if (Func->hasError()) 536 return; 537 Func->dump("After x86 address mode opt"); 538 539 // Disable constant blinding or pooling for load optimization. 540 { 541 BoolFlagSaver B(RandomizationPoolingPaused, true); 542 doLoadOpt(); 543 } 544 Func->genCode(); 545 if (Func->hasError()) 546 return; 547 if (SandboxingType != ST_None) { 548 initSandbox(); 549 } 550 Func->dump("After x86 codegen"); 551 splitBlockLocalVariables(Func); 552 553 // Register allocation. This requires instruction renumbering and full 554 // liveness analysis. Loops must be identified before liveness so variable 555 // use weights are correct. 556 Func->renumberInstructions(); 557 if (Func->hasError()) 558 return; 559 Func->liveness(Liveness_Intervals); 560 if (Func->hasError()) 561 return; 562 // The post-codegen dump is done here, after liveness analysis and associated 563 // cleanup, to make the dump cleaner and more useful. 564 Func->dump("After initial x86 codegen"); 565 // Validate the live range computations. The expensive validation call is 566 // deliberately only made when assertions are enabled. 
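  // (Note: when assertions are compiled out, e.g. in an NDEBUG build, the
  // assert() below expands to nothing and validateLiveness() is not called at
  // all.)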
567 assert(Func->validateLiveness()); 568 Func->getVMetadata()->init(VMK_All); 569 regAlloc(RAK_Global); 570 if (Func->hasError()) 571 return; 572 Func->dump("After linear scan regalloc"); 573 574 if (getFlags().getEnablePhiEdgeSplit()) { 575 Func->advancedPhiLowering(); 576 Func->dump("After advanced Phi lowering"); 577 } 578 579 // Stack frame mapping. 580 Func->genFrame(); 581 if (Func->hasError()) 582 return; 583 Func->dump("After stack frame mapping"); 584 585 Func->contractEmptyNodes(); 586 Func->reorderNodes(); 587 588 // Shuffle basic block order if -reorder-basic-blocks is enabled. 589 Func->shuffleNodes(); 590 591 // Branch optimization. This needs to be done just before code emission. In 592 // particular, no transformations that insert or reorder CfgNodes should be 593 // done after branch optimization. We go ahead and do it before nop insertion 594 // to reduce the amount of work needed for searching for opportunities. 595 Func->doBranchOpt(); 596 Func->dump("After branch optimization"); 597 598 // Nop insertion if -nop-insertion is enabled. 599 Func->doNopInsertion(); 600 601 // Mark nodes that require sandbox alignment 602 if (NeedSandboxing) { 603 Func->markNodesForSandboxing(); 604 } 605 } 606 607 template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() { 608 TimerMarker T(TimerStack::TT_Om1, Func); 609 610 if (SandboxingType != ST_None) { 611 initRebasePtr(); 612 } 613 614 genTargetHelperCalls(); 615 616 // Do not merge Alloca instructions, and lay out the stack. 617 static constexpr bool SortAndCombineAllocas = false; 618 Func->processAllocas(SortAndCombineAllocas); 619 Func->dump("After Alloca processing"); 620 621 Func->placePhiLoads(); 622 if (Func->hasError()) 623 return; 624 Func->placePhiStores(); 625 if (Func->hasError()) 626 return; 627 Func->deletePhis(); 628 if (Func->hasError()) 629 return; 630 Func->dump("After Phi lowering"); 631 632 Func->doArgLowering(); 633 Func->genCode(); 634 if (Func->hasError()) 635 return; 636 if (SandboxingType != ST_None) { 637 initSandbox(); 638 } 639 Func->dump("After initial x86 codegen"); 640 641 regAlloc(RAK_InfOnly); 642 if (Func->hasError()) 643 return; 644 Func->dump("After regalloc of infinite-weight variables"); 645 646 Func->genFrame(); 647 if (Func->hasError()) 648 return; 649 Func->dump("After stack frame mapping"); 650 651 // Shuffle basic block order if -reorder-basic-blocks is enabled. 652 Func->shuffleNodes(); 653 654 // Nop insertion if -nop-insertion is enabled. 655 Func->doNopInsertion(); 656 657 // Mark nodes that require sandbox alignment 658 if (NeedSandboxing) 659 Func->markNodesForSandboxing(); 660 } 661 662 inline bool canRMW(const InstArithmetic *Arith) { 663 Type Ty = Arith->getDest()->getType(); 664 // X86 vector instructions write to a register and have no RMW option. 
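  // Illustrative sketch (assumed encodings, not taken from this lowering): a
  // scalar RMW add can collapse load/add/store into a single
  // memory-destination instruction such as
  //   add DWORD PTR [eax], 123
  // whereas SSE arithmetic like addps only writes an xmm register, so the
  // separate load and store cannot be folded away.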
665 if (isVectorType(Ty)) 666 return false; 667 bool isI64 = Ty == IceType_i64; 668 669 switch (Arith->getOp()) { 670 // Not handled for lack of simple lowering: 671 // shift on i64 672 // mul, udiv, urem, sdiv, srem, frem 673 // Not handled for lack of RMW instructions: 674 // fadd, fsub, fmul, fdiv (also vector types) 675 default: 676 return false; 677 case InstArithmetic::Add: 678 case InstArithmetic::Sub: 679 case InstArithmetic::And: 680 case InstArithmetic::Or: 681 case InstArithmetic::Xor: 682 return true; 683 case InstArithmetic::Shl: 684 case InstArithmetic::Lshr: 685 case InstArithmetic::Ashr: 686 return false; // TODO(stichnot): implement 687 return !isI64; 688 } 689 } 690 691 template <typename TraitsType> 692 bool isSameMemAddressOperand(const Operand *A, const Operand *B) { 693 if (A == B) 694 return true; 695 if (auto *MemA = 696 llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>( 697 A)) { 698 if (auto *MemB = 699 llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>( 700 B)) { 701 return MemA->getBase() == MemB->getBase() && 702 MemA->getOffset() == MemB->getOffset() && 703 MemA->getIndex() == MemB->getIndex() && 704 MemA->getShift() == MemB->getShift() && 705 MemA->getSegmentRegister() == MemB->getSegmentRegister(); 706 } 707 } 708 return false; 709 } 710 711 template <typename TraitsType> void TargetX86Base<TraitsType>::findRMW() { 712 TimerMarker _(TimerStack::TT_findRMW, Func); 713 Func->dump("Before RMW"); 714 if (Func->isVerbose(IceV_RMW)) 715 Func->getContext()->lockStr(); 716 for (CfgNode *Node : Func->getNodes()) { 717 // Walk through the instructions, considering each sequence of 3 718 // instructions, and look for the particular RMW pattern. Note that this 719 // search can be "broken" (false negatives) if there are intervening 720 // deleted instructions, or intervening instructions that could be safely 721 // moved out of the way to reveal an RMW pattern. 722 auto E = Node->getInsts().end(); 723 auto I1 = E, I2 = E, I3 = Node->getInsts().begin(); 724 for (; I3 != E; I1 = I2, I2 = I3, ++I3) { 725 // Make I3 skip over deleted instructions. 726 while (I3 != E && I3->isDeleted()) 727 ++I3; 728 if (I1 == E || I2 == E || I3 == E) 729 continue; 730 assert(!I1->isDeleted()); 731 assert(!I2->isDeleted()); 732 assert(!I3->isDeleted()); 733 auto *Load = llvm::dyn_cast<InstLoad>(I1); 734 auto *Arith = llvm::dyn_cast<InstArithmetic>(I2); 735 auto *Store = llvm::dyn_cast<InstStore>(I3); 736 if (!Load || !Arith || !Store) 737 continue; 738 // Look for: 739 // a = Load addr 740 // b = <op> a, other 741 // Store b, addr 742 // Change to: 743 // a = Load addr 744 // b = <op> a, other 745 // x = FakeDef 746 // RMW <op>, addr, other, x 747 // b = Store b, addr, x 748 // Note that inferTwoAddress() makes sure setDestRedefined() gets called 749 // on the updated Store instruction, to avoid liveness problems later. 750 // 751 // With this transformation, the Store instruction acquires a Dest 752 // variable and is now subject to dead code elimination if there are no 753 // more uses of "b". Variable "x" is a beacon for determining whether the 754 // Store instruction gets dead-code eliminated. If the Store instruction 755 // is eliminated, then it must be the case that the RMW instruction ends 756 // x's live range, and therefore the RMW instruction will be retained and 757 // later lowered. 
On the other hand, if the RMW instruction does not end 758 // x's live range, then the Store instruction must still be present, and 759 // therefore the RMW instruction is ignored during lowering because it is 760 // redundant with the Store instruction. 761 // 762 // Note that if "a" has further uses, the RMW transformation may still 763 // trigger, resulting in two loads and one store, which is worse than the 764 // original one load and one store. However, this is probably rare, and 765 // caching probably keeps it just as fast. 766 if (!isSameMemAddressOperand<TraitsType>(Load->getSourceAddress(), 767 Store->getAddr())) 768 continue; 769 Operand *ArithSrcFromLoad = Arith->getSrc(0); 770 Operand *ArithSrcOther = Arith->getSrc(1); 771 if (ArithSrcFromLoad != Load->getDest()) { 772 if (!Arith->isCommutative() || ArithSrcOther != Load->getDest()) 773 continue; 774 std::swap(ArithSrcFromLoad, ArithSrcOther); 775 } 776 if (Arith->getDest() != Store->getData()) 777 continue; 778 if (!canRMW(Arith)) 779 continue; 780 if (Func->isVerbose(IceV_RMW)) { 781 Ostream &Str = Func->getContext()->getStrDump(); 782 Str << "Found RMW in " << Func->getFunctionName() << ":\n "; 783 Load->dump(Func); 784 Str << "\n "; 785 Arith->dump(Func); 786 Str << "\n "; 787 Store->dump(Func); 788 Str << "\n"; 789 } 790 Variable *Beacon = Func->makeVariable(IceType_i32); 791 Beacon->setMustNotHaveReg(); 792 Store->setRmwBeacon(Beacon); 793 auto *BeaconDef = InstFakeDef::create(Func, Beacon); 794 Node->getInsts().insert(I3, BeaconDef); 795 auto *RMW = InstX86FakeRMW::create(Func, ArithSrcOther, Store->getAddr(), 796 Beacon, Arith->getOp()); 797 Node->getInsts().insert(I3, RMW); 798 } 799 } 800 if (Func->isVerbose(IceV_RMW)) 801 Func->getContext()->unlockStr(); 802 } 803 804 // Converts a ConstantInteger32 operand into its constant value, or 805 // MemoryOrderInvalid if the operand is not a ConstantInteger32. 806 inline uint64_t getConstantMemoryOrder(Operand *Opnd) { 807 if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd)) 808 return Integer->getValue(); 809 return Intrinsics::MemoryOrderInvalid; 810 } 811 812 /// Determines whether the dest of a Load instruction can be folded into one of 813 /// the src operands of a 2-operand instruction. This is true as long as the 814 /// load dest matches exactly one of the binary instruction's src operands. 815 /// Replaces Src0 or Src1 with LoadSrc if the answer is true. 816 inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest, 817 Operand *&Src0, Operand *&Src1) { 818 if (Src0 == LoadDest && Src1 != LoadDest) { 819 Src0 = LoadSrc; 820 return true; 821 } 822 if (Src0 != LoadDest && Src1 == LoadDest) { 823 Src1 = LoadSrc; 824 return true; 825 } 826 return false; 827 } 828 829 template <typename TraitsType> void TargetX86Base<TraitsType>::doLoadOpt() { 830 TimerMarker _(TimerStack::TT_loadOpt, Func); 831 for (CfgNode *Node : Func->getNodes()) { 832 Context.init(Node); 833 while (!Context.atEnd()) { 834 Variable *LoadDest = nullptr; 835 Operand *LoadSrc = nullptr; 836 Inst *CurInst = iteratorToInst(Context.getCur()); 837 Inst *Next = Context.getNextInst(); 838 // Determine whether the current instruction is a Load instruction or 839 // equivalent. 840 if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) { 841 // An InstLoad always qualifies. 
842 LoadDest = Load->getDest(); 843 constexpr bool DoLegalize = false; 844 LoadSrc = formMemoryOperand(Load->getSourceAddress(), 845 LoadDest->getType(), DoLegalize); 846 } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) { 847 // An AtomicLoad intrinsic qualifies as long as it has a valid memory 848 // ordering, and can be implemented in a single instruction (i.e., not 849 // i64 on x86-32). 850 Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID; 851 if (ID == Intrinsics::AtomicLoad && 852 (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) && 853 Intrinsics::isMemoryOrderValid( 854 ID, getConstantMemoryOrder(Intrin->getArg(1)))) { 855 LoadDest = Intrin->getDest(); 856 constexpr bool DoLegalize = false; 857 LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(), 858 DoLegalize); 859 } 860 } 861 // A Load instruction can be folded into the following instruction only 862 // if the following instruction ends the Load's Dest variable's live 863 // range. 864 if (LoadDest && Next && Next->isLastUse(LoadDest)) { 865 assert(LoadSrc); 866 Inst *NewInst = nullptr; 867 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) { 868 Operand *Src0 = Arith->getSrc(0); 869 Operand *Src1 = Arith->getSrc(1); 870 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { 871 NewInst = InstArithmetic::create(Func, Arith->getOp(), 872 Arith->getDest(), Src0, Src1); 873 } 874 } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) { 875 Operand *Src0 = Icmp->getSrc(0); 876 Operand *Src1 = Icmp->getSrc(1); 877 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { 878 NewInst = InstIcmp::create(Func, Icmp->getCondition(), 879 Icmp->getDest(), Src0, Src1); 880 } 881 } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) { 882 Operand *Src0 = Fcmp->getSrc(0); 883 Operand *Src1 = Fcmp->getSrc(1); 884 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { 885 NewInst = InstFcmp::create(Func, Fcmp->getCondition(), 886 Fcmp->getDest(), Src0, Src1); 887 } 888 } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) { 889 Operand *Src0 = Select->getTrueOperand(); 890 Operand *Src1 = Select->getFalseOperand(); 891 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { 892 NewInst = InstSelect::create(Func, Select->getDest(), 893 Select->getCondition(), Src0, Src1); 894 } 895 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) { 896 // The load dest can always be folded into a Cast instruction. 897 auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0)); 898 if (Src0 == LoadDest) { 899 NewInst = InstCast::create(Func, Cast->getCastKind(), 900 Cast->getDest(), LoadSrc); 901 } 902 } 903 if (NewInst) { 904 CurInst->setDeleted(); 905 Next->setDeleted(); 906 Context.insert(NewInst); 907 // Update NewInst->LiveRangesEnded so that target lowering may 908 // benefit. Also update NewInst->HasSideEffects. 
909 NewInst->spliceLivenessInfo(Next, CurInst); 910 } 911 } 912 Context.advanceCur(); 913 Context.advanceNext(); 914 } 915 } 916 Func->dump("After load optimization"); 917 } 918 919 template <typename TraitsType> 920 bool TargetX86Base<TraitsType>::doBranchOpt(Inst *I, const CfgNode *NextNode) { 921 if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) { 922 return Br->optimizeBranch(NextNode); 923 } 924 return false; 925 } 926 927 template <typename TraitsType> 928 Variable *TargetX86Base<TraitsType>::getPhysicalRegister(RegNumT RegNum, 929 Type Ty) { 930 if (Ty == IceType_void) 931 Ty = IceType_i32; 932 if (PhysicalRegisters[Ty].empty()) 933 PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM); 934 assert(unsigned(RegNum) < PhysicalRegisters[Ty].size()); 935 Variable *Reg = PhysicalRegisters[Ty][RegNum]; 936 if (Reg == nullptr) { 937 Reg = Func->makeVariable(Ty); 938 Reg->setRegNum(RegNum); 939 PhysicalRegisters[Ty][RegNum] = Reg; 940 // Specially mark a named physical register as an "argument" so that it is 941 // considered live upon function entry. Otherwise it's possible to get 942 // liveness validation errors for saving callee-save registers. 943 Func->addImplicitArg(Reg); 944 // Don't bother tracking the live range of a named physical register. 945 Reg->setIgnoreLiveness(); 946 } 947 assert(Traits::getGprForType(Ty, RegNum) == RegNum); 948 return Reg; 949 } 950 951 template <typename TraitsType> 952 const char *TargetX86Base<TraitsType>::getRegName(RegNumT RegNum, 953 Type Ty) const { 954 return Traits::getRegName(Traits::getGprForType(Ty, RegNum)); 955 } 956 957 template <typename TraitsType> 958 void TargetX86Base<TraitsType>::emitVariable(const Variable *Var) const { 959 if (!BuildDefs::dump()) 960 return; 961 Ostream &Str = Ctx->getStrEmit(); 962 if (Var->hasReg()) { 963 const bool Is64BitSandboxing = Traits::Is64Bit && NeedSandboxing; 964 const Type VarType = (Var->isRematerializable() && Is64BitSandboxing) 965 ? IceType_i64 966 : Var->getType(); 967 Str << "%" << getRegName(Var->getRegNum(), VarType); 968 return; 969 } 970 if (Var->mustHaveReg()) { 971 llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() + 972 ") has no register assigned - function " + 973 Func->getFunctionName()); 974 } 975 const int32_t Offset = Var->getStackOffset(); 976 auto BaseRegNum = Var->getBaseRegNum(); 977 if (BaseRegNum.hasNoValue()) 978 BaseRegNum = getFrameOrStackReg(); 979 980 // Print in the form "Offset(%reg)", omitting Offset when it is 0. 981 if (getFlags().getDecorateAsm()) { 982 Str << Var->getSymbolicStackOffset(); 983 } else if (Offset != 0) { 984 Str << Offset; 985 } 986 const Type FrameSPTy = Traits::WordType; 987 Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")"; 988 } 989 990 template <typename TraitsType> 991 typename TargetX86Base<TraitsType>::X86Address 992 TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const { 993 if (Var->hasReg()) 994 llvm::report_fatal_error("Stack Variable has a register assigned"); 995 if (Var->mustHaveReg()) { 996 llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() + 997 ") has no register assigned - function " + 998 Func->getFunctionName()); 999 } 1000 int32_t Offset = Var->getStackOffset(); 1001 auto BaseRegNum = Var->getBaseRegNum(); 1002 if (Var->getBaseRegNum().hasNoValue()) { 1003 // If the stack pointer needs alignment, we must use the frame pointer for 1004 // arguments. For locals, getFrameOrStackReg will return the stack pointer 1005 // in this case. 
1006 if (needsStackPointerAlignment() && Var->getIsArg()) { 1007 assert(hasFramePointer()); 1008 BaseRegNum = getFrameReg(); 1009 } else { 1010 BaseRegNum = getFrameOrStackReg(); 1011 } 1012 } 1013 return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset, 1014 AssemblerFixup::NoFixup); 1015 } 1016 1017 template <typename TraitsType> 1018 void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) { 1019 // Stack frame layout: 1020 // 1021 // +------------------------+ ^ + 1022 // | 1. return address | | 1023 // +------------------------+ v - 1024 // | 2. preserved registers | 1025 // +------------------------+ <--- BasePointer (if used) 1026 // | 3. padding | 1027 // +------------------------+ 1028 // | 4. global spill area | 1029 // +------------------------+ 1030 // | 5. padding | 1031 // +------------------------+ 1032 // | 6. local spill area | 1033 // +------------------------+ 1034 // | 7. padding | 1035 // +------------------------+ 1036 // | 7.5 shadow (WinX64) | 1037 // +------------------------+ 1038 // | 8. allocas | 1039 // +------------------------+ 1040 // | 9. padding | 1041 // +------------------------+ 1042 // | 10. out args | 1043 // +------------------------+ <--- StackPointer 1044 // 1045 // The following variables record the size in bytes of the given areas: 1046 // * X86_RET_IP_SIZE_BYTES: area 1 1047 // * PreservedRegsSizeBytes: area 2 1048 // * SpillAreaPaddingBytes: area 3 1049 // * GlobalsSize: area 4 1050 // * LocalsSlotsPaddingBytes: area 5 1051 // * GlobalsAndSubsequentPaddingSize: areas 4 - 5 1052 // * LocalsSpillAreaSize: area 6 1053 // * FixedAllocaSizeBytes: areas 7 - 8 1054 // * SpillAreaSizeBytes: areas 3 - 10 1055 // * maxOutArgsSizeBytes(): areas 9 - 10 1056 1057 // Determine stack frame offsets for each Variable without a register 1058 // assignment. This can be done as one variable per stack slot. Or, do 1059 // coalescing by running the register allocator again with an infinite set of 1060 // registers (as a side effect, this gives variables a second chance at 1061 // physical register assignment). 1062 // 1063 // A middle ground approach is to leverage sparsity and allocate one block of 1064 // space on the frame for globals (variables with multi-block lifetime), and 1065 // one block to share for locals (single-block lifetime). 1066 1067 const SizeT ShadowStoreSize = getShadowStoreSize<Traits>(); 1068 1069 // StackPointer: points just past return address of calling function 1070 1071 Context.init(Node); 1072 Context.setInsertPoint(Context.getCur()); 1073 1074 SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None); 1075 RegsUsed = SmallBitVector(CalleeSaves.size()); 1076 VarList SortedSpilledVariables, VariablesLinkedToSpillSlots; 1077 size_t GlobalsSize = 0; 1078 // If there is a separate locals area, this represents that area. Otherwise 1079 // it counts any variable not counted by GlobalsSize. 1080 SpillAreaSizeBytes = 0; 1081 // If there is a separate locals area, this specifies the alignment for it. 1082 uint32_t LocalsSlotsAlignmentBytes = 0; 1083 // The entire spill locations area gets aligned to largest natural alignment 1084 // of the variables that have a spill slot. 1085 uint32_t SpillAreaAlignmentBytes = 0; 1086 // A spill slot linked to a variable with a stack slot should reuse that 1087 // stack slot. 1088 std::function<bool(Variable *)> TargetVarHook = 1089 [&VariablesLinkedToSpillSlots](Variable *Var) { 1090 // TODO(stichnot): Refactor this into the base class. 
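        // A variable linked to a stack root (e.g. one created by variable
        // splitting) reuses the root's spill slot; the actual offset is copied
        // from the root later in addProlog() once slots are assigned.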
1091 Variable *Root = Var->getLinkedToStackRoot(); 1092 if (Root != nullptr) { 1093 assert(!Root->hasReg()); 1094 if (!Root->hasReg()) { 1095 VariablesLinkedToSpillSlots.push_back(Var); 1096 return true; 1097 } 1098 } 1099 return false; 1100 }; 1101 1102 // Compute the list of spilled variables and bounds for GlobalsSize, etc. 1103 getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize, 1104 &SpillAreaSizeBytes, &SpillAreaAlignmentBytes, 1105 &LocalsSlotsAlignmentBytes, TargetVarHook); 1106 uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes; 1107 SpillAreaSizeBytes += GlobalsSize; 1108 1109 // Add push instructions for preserved registers. 1110 uint32_t NumCallee = 0; 1111 size_t PreservedRegsSizeBytes = 0; 1112 SmallBitVector Pushed(CalleeSaves.size()); 1113 for (RegNumT i : RegNumBVIter(CalleeSaves)) { 1114 const auto Canonical = Traits::getBaseReg(i); 1115 assert(Canonical == Traits::getBaseReg(Canonical)); 1116 if (RegsUsed[i]) { 1117 Pushed[Canonical] = true; 1118 } 1119 } 1120 for (RegNumT RegNum : RegNumBVIter(Pushed)) { 1121 assert(RegNum == Traits::getBaseReg(RegNum)); 1122 ++NumCallee; 1123 if (Traits::isXmm(RegNum)) { 1124 PreservedRegsSizeBytes += 16; 1125 } else { 1126 PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType); 1127 } 1128 _push_reg(RegNum); 1129 } 1130 Ctx->statsUpdateRegistersSaved(NumCallee); 1131 1132 // StackPointer: points past preserved registers at start of spill area 1133 1134 // Generate "push frameptr; mov frameptr, stackptr" 1135 if (IsEbpBasedFrame) { 1136 assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)) 1137 .count() == 0); 1138 PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType); 1139 _link_bp(); 1140 } 1141 1142 // Align the variables area. SpillAreaPaddingBytes is the size of the region 1143 // after the preserved registers and before the spill areas. 1144 // LocalsSlotsPaddingBytes is the amount of padding between the globals and 1145 // locals area if they are separate. 1146 assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes); 1147 uint32_t SpillAreaPaddingBytes = 0; 1148 uint32_t LocalsSlotsPaddingBytes = 0; 1149 alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes, 1150 SpillAreaAlignmentBytes, GlobalsSize, 1151 LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes, 1152 &LocalsSlotsPaddingBytes); 1153 SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes; 1154 uint32_t GlobalsAndSubsequentPaddingSize = 1155 GlobalsSize + LocalsSlotsPaddingBytes; 1156 1157 // Functions returning scalar floating point types may need to convert values 1158 // from an in-register xmm value to the top of the x87 floating point stack. 1159 // This is done by a movp[sd] and an fld[sd]. Ensure there is enough scratch 1160 // space on the stack for this. 1161 const Type ReturnType = Func->getReturnType(); 1162 if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) { 1163 if (isScalarFloatingType(ReturnType)) { 1164 // Avoid misaligned double-precision load/store. 
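      // Illustrative sketch of the xmm-to-x87 hand-off described above
      // (assumed f64 case; mnemonics are not quoted from the emitter):
      //   movsd QWORD PTR [esp], xmm0   ; spill the xmm return value
      //   fld   QWORD PTR [esp]         ; reload it onto the x87 stack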
1165 RequiredStackAlignment = std::max<size_t>( 1166 RequiredStackAlignment, Traits::X86_STACK_ALIGNMENT_BYTES); 1167 SpillAreaSizeBytes = 1168 std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes); 1169 } 1170 } 1171 1172 RequiredStackAlignment = 1173 std::max<size_t>(RequiredStackAlignment, SpillAreaAlignmentBytes); 1174 1175 if (PrologEmitsFixedAllocas) { 1176 RequiredStackAlignment = 1177 std::max(RequiredStackAlignment, FixedAllocaAlignBytes); 1178 } 1179 1180 // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the 1181 // fixed allocations in the prolog. 1182 if (PrologEmitsFixedAllocas) 1183 SpillAreaSizeBytes += FixedAllocaSizeBytes; 1184 1185 // Win64 ABI: add space for shadow store (aka home space) 1186 SpillAreaSizeBytes += ShadowStoreSize; 1187 1188 // Entering the function has made the stack pointer unaligned. Re-align it by 1189 // adjusting the stack size. 1190 // Note that StackOffset does not include spill area. It's the offset from the 1191 // base stack pointer (ebp), whether we set it or not, to the first stack 1192 // arg (if any). StackSize, on the other hand, does include the spill area. 1193 const uint32_t StackOffset = 1194 ShadowStoreSize + Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes; 1195 uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes, 1196 RequiredStackAlignment); 1197 StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(), 1198 RequiredStackAlignment); 1199 SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any 1200 1201 if (SpillAreaSizeBytes) { 1202 emitStackProbe(SpillAreaSizeBytes); 1203 1204 // Generate "sub stackptr, SpillAreaSizeBytes" 1205 _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes)); 1206 } 1207 1208 // StackPointer: points just past the spill area (end of stack frame) 1209 1210 // If the required alignment is greater than the stack pointer's guaranteed 1211 // alignment, align the stack pointer accordingly. 1212 if (RequiredStackAlignment > Traits::X86_STACK_ALIGNMENT_BYTES) { 1213 assert(IsEbpBasedFrame); 1214 _and(getPhysicalRegister(getStackReg(), Traits::WordType), 1215 Ctx->getConstantInt32(-RequiredStackAlignment)); 1216 } 1217 1218 // StackPointer: may have just been offset for alignment 1219 1220 // Account for known-frame-offset alloca instructions that were not already 1221 // combined into the prolog. 1222 if (!PrologEmitsFixedAllocas) 1223 SpillAreaSizeBytes += FixedAllocaSizeBytes; 1224 1225 Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes); 1226 1227 // Fill in stack offsets for stack args, and copy args into registers for 1228 // those that were register-allocated. Args are pushed right to left, so 1229 // Arg[0] is closest to the stack/frame pointer. 1230 RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg(); 1231 Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType); 1232 size_t BasicFrameOffset = StackOffset; 1233 if (!IsEbpBasedFrame) 1234 BasicFrameOffset += SpillAreaSizeBytes; 1235 1236 emitGetIP(Node); 1237 1238 const VarList &Args = Func->getArgs(); 1239 size_t InArgsSizeBytes = 0; 1240 unsigned NumXmmArgs = 0; 1241 unsigned NumGPRArgs = 0; 1242 for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) { 1243 Variable *Arg = Args[i]; 1244 // Skip arguments passed in registers.
1245 if (isVectorType(Arg->getType())) { 1246 if (Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs)) 1247 .hasValue()) { 1248 ++NumXmmArgs; 1249 continue; 1250 } 1251 } else if (isScalarFloatingType(Arg->getType())) { 1252 if (Traits::X86_PASS_SCALAR_FP_IN_XMM && 1253 Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs)) 1254 .hasValue()) { 1255 ++NumXmmArgs; 1256 continue; 1257 } 1258 } else { 1259 assert(isScalarIntegerType(Arg->getType())); 1260 if (Traits::getRegisterForGprArgNum(Traits::WordType, 1261 Traits::getArgIndex(i, NumGPRArgs)) 1262 .hasValue()) { 1263 ++NumGPRArgs; 1264 continue; 1265 } 1266 } 1267 // For esp-based frames where the allocas are done outside the prolog, the 1268 // esp value may not stabilize to its home value until after all the 1269 // fixed-size alloca instructions have executed. In this case, a stack 1270 // adjustment is needed when accessing in-args in order to copy them into 1271 // registers. 1272 size_t StackAdjBytes = 0; 1273 if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas) 1274 StackAdjBytes -= FixedAllocaSizeBytes; 1275 finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes, 1276 InArgsSizeBytes); 1277 } 1278 1279 // Fill in stack offsets for locals. 1280 assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes, 1281 SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize, 1282 IsEbpBasedFrame && !needsStackPointerAlignment()); 1283 // Assign stack offsets to variables that have been linked to spilled 1284 // variables. 1285 for (Variable *Var : VariablesLinkedToSpillSlots) { 1286 const Variable *Root = Var->getLinkedToStackRoot(); 1287 assert(Root != nullptr); 1288 Var->setStackOffset(Root->getStackOffset()); 1289 } 1290 this->HasComputedFrame = true; 1291 1292 if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) { 1293 OstreamLocker L(Func->getContext()); 1294 Ostream &Str = Func->getContext()->getStrDump(); 1295 1296 Str << "Stack layout:\n"; 1297 uint32_t EspAdjustmentPaddingSize = 1298 SpillAreaSizeBytes - LocalsSpillAreaSize - 1299 GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes - 1300 maxOutArgsSizeBytes(); 1301 Str << " in-args = " << InArgsSizeBytes << " bytes\n" 1302 << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n" 1303 << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n" 1304 << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n" 1305 << " globals spill area = " << GlobalsSize << " bytes\n" 1306 << " globals-locals spill areas intermediate padding = " 1307 << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n" 1308 << " locals spill area = " << LocalsSpillAreaSize << " bytes\n" 1309 << " esp alignment padding = " << EspAdjustmentPaddingSize 1310 << " bytes\n"; 1311 1312 Str << "Stack details:\n" 1313 << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n" 1314 << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n" 1315 << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n" 1316 << " locals spill area alignment = " << LocalsSlotsAlignmentBytes 1317 << " bytes\n" 1318 << " is ebp based = " << IsEbpBasedFrame << "\n"; 1319 } 1320 } 1321 1322 /// Helper function for addProlog(). 1323 /// 1324 /// This assumes Arg is an argument passed on the stack. This sets the frame 1325 /// offset for Arg and updates InArgsSizeBytes according to Arg's width. 
For an 1326 /// I64 arg that has been split into Lo and Hi components, it calls itself 1327 /// recursively on the components, taking care to handle Lo first because of the 1328 /// little-endian architecture. Lastly, this function generates an instruction 1329 /// to copy Arg into its assigned register if applicable. 1330 template <typename TraitsType> 1331 void TargetX86Base<TraitsType>::finishArgumentLowering( 1332 Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset, 1333 size_t StackAdjBytes, size_t &InArgsSizeBytes) { 1334 if (!Traits::Is64Bit) { 1335 if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) { 1336 Variable *Lo = Arg64On32->getLo(); 1337 Variable *Hi = Arg64On32->getHi(); 1338 finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes, 1339 InArgsSizeBytes); 1340 finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes, 1341 InArgsSizeBytes); 1342 return; 1343 } 1344 } 1345 Type Ty = Arg->getType(); 1346 if (isVectorType(Ty)) { 1347 InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes); 1348 } 1349 Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes); 1350 InArgsSizeBytes += typeWidthInBytesOnStack(Ty); 1351 if (Arg->hasReg()) { 1352 assert(Ty != IceType_i64 || Traits::Is64Bit); 1353 auto *Mem = X86OperandMem::create( 1354 Func, Ty, FramePtr, 1355 Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes)); 1356 if (isVectorType(Arg->getType())) { 1357 _movp(Arg, Mem); 1358 } else { 1359 _mov(Arg, Mem); 1360 } 1361 // This argument-copying instruction uses an explicit X86OperandMem 1362 // operand instead of a Variable, so its fill-from-stack operation has to 1363 // be tracked separately for statistics. 1364 Ctx->statsUpdateFills(); 1365 } 1366 } 1367 1368 template <typename TraitsType> 1369 void TargetX86Base<TraitsType>::addEpilog(CfgNode *Node) { 1370 InstList &Insts = Node->getInsts(); 1371 InstList::reverse_iterator RI, E; 1372 for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) { 1373 if (llvm::isa<typename Traits::Insts::Ret>(*RI)) 1374 break; 1375 } 1376 if (RI == E) 1377 return; 1378 1379 // Convert the reverse_iterator position into its corresponding (forward) 1380 // iterator position. 1381 InstList::iterator InsertPoint = reverseToForwardIterator(RI); 1382 --InsertPoint; 1383 Context.init(Node); 1384 Context.setInsertPoint(InsertPoint); 1385 1386 if (IsEbpBasedFrame) { 1387 _unlink_bp(); 1388 } else { 1389 // add stackptr, SpillAreaSizeBytes 1390 if (SpillAreaSizeBytes != 0) { 1391 _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes)); 1392 } 1393 } 1394 1395 // Add pop instructions for preserved registers. 
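  // The registers are popped in the reverse of the push order used in
  // addProlog(), by walking the callee-save set from the highest register
  // number down.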
1396 SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None); 1397 SmallBitVector Popped(CalleeSaves.size()); 1398 for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) { 1399 const auto RegNum = RegNumT::fromInt(i); 1400 if (RegNum == getFrameReg() && IsEbpBasedFrame) 1401 continue; 1402 const RegNumT Canonical = Traits::getBaseReg(RegNum); 1403 if (CalleeSaves[i] && RegsUsed[i]) { 1404 Popped[Canonical] = true; 1405 } 1406 } 1407 for (int32_t i = Popped.size() - 1; i >= 0; --i) { 1408 if (!Popped[i]) 1409 continue; 1410 const auto RegNum = RegNumT::fromInt(i); 1411 assert(RegNum == Traits::getBaseReg(RegNum)); 1412 _pop_reg(RegNum); 1413 } 1414 1415 if (!NeedSandboxing) { 1416 return; 1417 } 1418 emitSandboxedReturn(); 1419 if (RI->getSrcSize()) { 1420 auto *RetValue = llvm::cast<Variable>(RI->getSrc(0)); 1421 Context.insert<InstFakeUse>(RetValue); 1422 } 1423 RI->setDeleted(); 1424 } 1425 1426 template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() { 1427 return Traits::WordType; 1428 } 1429 1430 template <typename TraitsType> 1431 template <typename T> 1432 typename std::enable_if<!T::Is64Bit, Operand>::type * 1433 TargetX86Base<TraitsType>::loOperand(Operand *Operand) { 1434 assert(Operand->getType() == IceType_i64 || 1435 Operand->getType() == IceType_f64); 1436 if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64) 1437 return Operand; 1438 if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand)) 1439 return Var64On32->getLo(); 1440 if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) { 1441 auto *ConstInt = llvm::dyn_cast<ConstantInteger32>( 1442 Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue()))); 1443 // Check if we need to blind/pool the constant. 1444 return legalize(ConstInt); 1445 } 1446 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) { 1447 auto *MemOperand = X86OperandMem::create( 1448 Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(), 1449 Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased()); 1450 // Test if we should randomize or pool the offset, if so randomize it or 1451 // pool it then create mem operand with the blinded/pooled constant. 1452 // Otherwise, return the mem operand as ordinary mem operand. 1453 return legalize(MemOperand); 1454 } 1455 llvm_unreachable("Unsupported operand type"); 1456 return nullptr; 1457 } 1458 1459 template <typename TraitsType> 1460 template <typename T> 1461 typename std::enable_if<!T::Is64Bit, Operand>::type * 1462 TargetX86Base<TraitsType>::hiOperand(Operand *Operand) { 1463 assert(Operand->getType() == IceType_i64 || 1464 Operand->getType() == IceType_f64); 1465 if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64) 1466 return Operand; 1467 if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand)) 1468 return Var64On32->getHi(); 1469 if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) { 1470 auto *ConstInt = llvm::dyn_cast<ConstantInteger32>( 1471 Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32))); 1472 // Check if we need to blind/pool the constant. 
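    // Worked example (illustrative): for the i64 constant 0x1122334455667788,
    // loOperand() produces 0x55667788 and the shift above produces 0x11223344
    // here; for a memory operand the split is instead done by adding 4 to the
    // offset below.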
1473 return legalize(ConstInt); 1474 } 1475 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) { 1476 Constant *Offset = Mem->getOffset(); 1477 if (Offset == nullptr) { 1478 Offset = Ctx->getConstantInt32(4); 1479 } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) { 1480 Offset = Ctx->getConstantInt32(4 + IntOffset->getValue()); 1481 } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) { 1482 assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4)); 1483 Offset = 1484 Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName()); 1485 } 1486 auto *MemOperand = X86OperandMem::create( 1487 Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(), 1488 Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased()); 1489 // Test if the Offset is an eligible i32 constant for randomization and 1490 // pooling. Blind/pool it if it is. Otherwise return as ordinary mem 1491 // operand. 1492 return legalize(MemOperand); 1493 } 1494 llvm_unreachable("Unsupported operand type"); 1495 return nullptr; 1496 } 1497 1498 template <typename TraitsType> 1499 SmallBitVector 1500 TargetX86Base<TraitsType>::getRegisterSet(RegSetMask Include, 1501 RegSetMask Exclude) const { 1502 return Traits::getRegisterSet(getFlags(), Include, Exclude); 1503 } 1504 1505 template <typename TraitsType> 1506 void TargetX86Base<TraitsType>::lowerAlloca(const InstAlloca *Instr) { 1507 // Conservatively require the stack to be aligned. Some stack adjustment 1508 // operations implemented below assume that the stack is aligned before the 1509 // alloca. All the alloca code ensures that the stack alignment is preserved 1510 // after the alloca. The stack alignment restriction can be relaxed in some 1511 // cases. 1512 RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment, 1513 Traits::X86_STACK_ALIGNMENT_BYTES); 1514 1515 // For default align=0, set it to the real value 1, to avoid any 1516 // bit-manipulation problems below. 1517 const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes()); 1518 1519 // LLVM enforces power of 2 alignment. 1520 assert(llvm::isPowerOf2_32(AlignmentParam)); 1521 assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES)); 1522 1523 const uint32_t Alignment = 1524 std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES); 1525 const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES; 1526 const bool OptM1 = Func->getOptLevel() == Opt_m1; 1527 const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset(); 1528 const bool UseFramePointer = 1529 hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1; 1530 1531 if (UseFramePointer) 1532 setHasFramePointer(); 1533 1534 Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType); 1535 if (OverAligned) { 1536 _and(esp, Ctx->getConstantInt32(-Alignment)); 1537 } 1538 1539 Variable *Dest = Instr->getDest(); 1540 Operand *TotalSize = legalize(Instr->getSizeInBytes()); 1541 1542 if (const auto *ConstantTotalSize = 1543 llvm::dyn_cast<ConstantInteger32>(TotalSize)) { 1544 const uint32_t Value = 1545 Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment); 1546 if (UseFramePointer) { 1547 _sub_sp(Ctx->getConstantInt32(Value)); 1548 } else { 1549 // If we don't need a Frame Pointer, this alloca has a known offset to the 1550 // stack pointer. We don't need to adjust the stack pointer, nor assign any 1551 // value to Dest, as Dest is rematerializable.
1552 assert(Dest->isRematerializable()); 1553 FixedAllocaSizeBytes += Value; 1554 Context.insert<InstFakeDef>(Dest); 1555 } 1556 } else { 1557 // Non-constant sizes need to be adjusted to the next highest multiple of 1558 // the required alignment at runtime. 1559 Variable *T = nullptr; 1560 if (Traits::Is64Bit && TotalSize->getType() != IceType_i64 && 1561 !NeedSandboxing) { 1562 T = makeReg(IceType_i64); 1563 _movzx(T, TotalSize); 1564 } else { 1565 T = makeReg(IceType_i32); 1566 _mov(T, TotalSize); 1567 } 1568 _add(T, Ctx->getConstantInt32(Alignment - 1)); 1569 _and(T, Ctx->getConstantInt32(-Alignment)); 1570 _sub_sp(T); 1571 } 1572 // Add enough to the returned address to account for the out args area. 1573 uint32_t OutArgsSize = maxOutArgsSizeBytes(); 1574 if (OutArgsSize > 0) { 1575 Variable *T = makeReg(Dest->getType()); 1576 auto *CalculateOperand = X86OperandMem::create( 1577 Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize)); 1578 _lea(T, CalculateOperand); 1579 _mov(Dest, T); 1580 } else { 1581 _mov(Dest, esp); 1582 } 1583 } 1584 1585 template <typename TraitsType> 1586 void TargetX86Base<TraitsType>::lowerArguments() { 1587 const bool OptM1 = Func->getOptLevel() == Opt_m1; 1588 VarList &Args = Func->getArgs(); 1589 unsigned NumXmmArgs = 0; 1590 bool XmmSlotsRemain = true; 1591 unsigned NumGprArgs = 0; 1592 bool GprSlotsRemain = true; 1593 1594 Context.init(Func->getEntryNode()); 1595 Context.setInsertPoint(Context.getCur()); 1596 1597 for (SizeT i = 0, End = Args.size(); 1598 i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) { 1599 Variable *Arg = Args[i]; 1600 Type Ty = Arg->getType(); 1601 Variable *RegisterArg = nullptr; 1602 RegNumT RegNum; 1603 if (isVectorType(Ty)) { 1604 RegNum = 1605 Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs)); 1606 if (RegNum.hasNoValue()) { 1607 XmmSlotsRemain = false; 1608 continue; 1609 } 1610 ++NumXmmArgs; 1611 RegisterArg = Func->makeVariable(Ty); 1612 } else if (isScalarFloatingType(Ty)) { 1613 if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) { 1614 continue; 1615 } 1616 RegNum = 1617 Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, NumXmmArgs)); 1618 if (RegNum.hasNoValue()) { 1619 XmmSlotsRemain = false; 1620 continue; 1621 } 1622 ++NumXmmArgs; 1623 RegisterArg = Func->makeVariable(Ty); 1624 } else if (isScalarIntegerType(Ty)) { 1625 RegNum = Traits::getRegisterForGprArgNum( 1626 Ty, Traits::getArgIndex(i, NumGprArgs)); 1627 if (RegNum.hasNoValue()) { 1628 GprSlotsRemain = false; 1629 continue; 1630 } 1631 ++NumGprArgs; 1632 RegisterArg = Func->makeVariable(Ty); 1633 } 1634 assert(RegNum.hasValue()); 1635 assert(RegisterArg != nullptr); 1636 // Replace Arg in the argument list with the home register. Then generate 1637 // an instruction in the prolog to copy the home register to the assigned 1638 // location of Arg. 1639 if (BuildDefs::dump()) 1640 RegisterArg->setName(Func, "home_reg:" + Arg->getName()); 1641 RegisterArg->setRegNum(RegNum); 1642 RegisterArg->setIsArg(); 1643 Arg->setIsArg(false); 1644 1645 Args[i] = RegisterArg; 1646 // When not Om1, do the assignment through a temporary, instead of directly 1647 // from the pre-colored variable, so that a subsequent availabilityGet() 1648 // call has a chance to work. (In Om1, don't bother creating extra 1649 // instructions with extra variables to register-allocate.) 
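    // For illustration (R here stands for whatever register was assigned
    // above): the entry block now begins with either "Arg = home_reg:Arg(R)"
    // in Om1, or "Tmp = home_reg:Arg(R); Arg = Tmp" otherwise, so the
    // availability peephole can later forward Tmp into subsequent uses of Arg.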
    if (OptM1) {
      Context.insert<InstAssign>(Arg, RegisterArg);
    } else {
      Variable *Tmp = makeReg(RegisterArg->getType());
      Context.insert<InstAssign>(Tmp, RegisterArg);
      Context.insert<InstAssign>(Arg, Tmp);
    }
  }
  if (!OptM1)
    Context.availabilityUpdate();
}

/// Strength-reduce scalar integer multiplication by a constant (for i32 or
/// narrower) for certain constants. The lea instruction can be used to multiply
/// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
/// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
/// lea-based multiplies by 5, combined with left-shifting by 2.
template <typename TraitsType>
bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
                                                  int32_t Src1) {
  // Disable this optimization for Om1 and O0, just to keep things simple
  // there.
  if (Func->getOptLevel() < Opt_1)
    return false;
  Type Ty = Dest->getType();
  if (Src1 == -1) {
    Variable *T = nullptr;
    _mov(T, Src0);
    _neg(T);
    _mov(Dest, T);
    return true;
  }
  if (Src1 == 0) {
    _mov(Dest, Ctx->getConstantZero(Ty));
    return true;
  }
  if (Src1 == 1) {
    Variable *T = nullptr;
    _mov(T, Src0);
    _mov(Dest, T);
    return true;
  }
  // Don't bother with the edge case where Src1 == MININT.
  if (Src1 == -Src1)
    return false;
  const bool Src1IsNegative = Src1 < 0;
  if (Src1IsNegative)
    Src1 = -Src1;
  uint32_t Count9 = 0;
  uint32_t Count5 = 0;
  uint32_t Count3 = 0;
  uint32_t Count2 = 0;
  uint32_t CountOps = 0;
  while (Src1 > 1) {
    if (Src1 % 9 == 0) {
      ++CountOps;
      ++Count9;
      Src1 /= 9;
    } else if (Src1 % 5 == 0) {
      ++CountOps;
      ++Count5;
      Src1 /= 5;
    } else if (Src1 % 3 == 0) {
      ++CountOps;
      ++Count3;
      Src1 /= 3;
    } else if (Src1 % 2 == 0) {
      if (Count2 == 0)
        ++CountOps;
      ++Count2;
      Src1 /= 2;
    } else {
      return false;
    }
  }
  // The lea-based decomposition is only used for i32 (or i64 when targeting
  // x86-64); bail out for i8 and i16 if any lea steps would be needed.
  if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
      (Count3 || Count5 || Count9))
    return false;
  // Limit the number of lea/shl operations for a single multiply, to a
  // somewhat arbitrary choice of 3.
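  // For example, Src1 == 100 decomposes into Count5 == 2 and Count2 == 2 with
  // CountOps == 3 (consecutive shifts count once), i.e. two lea T, [T + 4*T]
  // steps followed by a single shl T, 2.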
1731 constexpr uint32_t MaxOpsForOptimizedMul = 3; 1732 if (CountOps > MaxOpsForOptimizedMul) 1733 return false; 1734 Variable *T = makeReg(Traits::WordType); 1735 if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) { 1736 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 1737 _movzx(T, Src0RM); 1738 } else { 1739 _mov(T, Src0); 1740 } 1741 Constant *Zero = Ctx->getConstantZero(IceType_i32); 1742 for (uint32_t i = 0; i < Count9; ++i) { 1743 constexpr uint16_t Shift = 3; // log2(9-1) 1744 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); 1745 } 1746 for (uint32_t i = 0; i < Count5; ++i) { 1747 constexpr uint16_t Shift = 2; // log2(5-1) 1748 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); 1749 } 1750 for (uint32_t i = 0; i < Count3; ++i) { 1751 constexpr uint16_t Shift = 1; // log2(3-1) 1752 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); 1753 } 1754 if (Count2) { 1755 _shl(T, Ctx->getConstantInt(Ty, Count2)); 1756 } 1757 if (Src1IsNegative) 1758 _neg(T); 1759 _mov(Dest, T); 1760 return true; 1761 } 1762 1763 template <typename TraitsType> 1764 void TargetX86Base<TraitsType>::lowerShift64(InstArithmetic::OpKind Op, 1765 Operand *Src0Lo, Operand *Src0Hi, 1766 Operand *Src1Lo, Variable *DestLo, 1767 Variable *DestHi) { 1768 // TODO: Refactor the similarities between Shl, Lshr, and Ashr. 1769 Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr; 1770 Constant *Zero = Ctx->getConstantZero(IceType_i32); 1771 Constant *SignExtend = Ctx->getConstantInt32(0x1f); 1772 if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) { 1773 uint32_t ShiftAmount = ConstantShiftAmount->getValue(); 1774 if (ShiftAmount > 32) { 1775 Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32); 1776 switch (Op) { 1777 default: 1778 assert(0 && "non-shift op"); 1779 break; 1780 case InstArithmetic::Shl: { 1781 // a=b<<c ==> 1782 // t2 = b.lo 1783 // t2 = shl t2, ShiftAmount-32 1784 // t3 = t2 1785 // t2 = 0 1786 _mov(T_2, Src0Lo); 1787 _shl(T_2, ReducedShift); 1788 _mov(DestHi, T_2); 1789 _mov(DestLo, Zero); 1790 } break; 1791 case InstArithmetic::Lshr: { 1792 // a=b>>c (unsigned) ==> 1793 // t2 = b.hi 1794 // t2 = shr t2, ShiftAmount-32 1795 // a.lo = t2 1796 // a.hi = 0 1797 _mov(T_2, Src0Hi); 1798 _shr(T_2, ReducedShift); 1799 _mov(DestLo, T_2); 1800 _mov(DestHi, Zero); 1801 } break; 1802 case InstArithmetic::Ashr: { 1803 // a=b>>c (signed) ==> 1804 // t3 = b.hi 1805 // t3 = sar t3, 0x1f 1806 // t2 = b.hi 1807 // t2 = shrd t2, t3, ShiftAmount-32 1808 // a.lo = t2 1809 // a.hi = t3 1810 _mov(T_3, Src0Hi); 1811 _sar(T_3, SignExtend); 1812 _mov(T_2, Src0Hi); 1813 _shrd(T_2, T_3, ReducedShift); 1814 _mov(DestLo, T_2); 1815 _mov(DestHi, T_3); 1816 } break; 1817 } 1818 } else if (ShiftAmount == 32) { 1819 switch (Op) { 1820 default: 1821 assert(0 && "non-shift op"); 1822 break; 1823 case InstArithmetic::Shl: { 1824 // a=b<<c ==> 1825 // t2 = b.lo 1826 // a.hi = t2 1827 // a.lo = 0 1828 _mov(T_2, Src0Lo); 1829 _mov(DestHi, T_2); 1830 _mov(DestLo, Zero); 1831 } break; 1832 case InstArithmetic::Lshr: { 1833 // a=b>>c (unsigned) ==> 1834 // t2 = b.hi 1835 // a.lo = t2 1836 // a.hi = 0 1837 _mov(T_2, Src0Hi); 1838 _mov(DestLo, T_2); 1839 _mov(DestHi, Zero); 1840 } break; 1841 case InstArithmetic::Ashr: { 1842 // a=b>>c (signed) ==> 1843 // t2 = b.hi 1844 // a.lo = t2 1845 // t3 = b.hi 1846 // t3 = sar t3, 0x1f 1847 // a.hi = t3 1848 _mov(T_2, Src0Hi); 1849 _mov(DestLo, T_2); 1850 _mov(T_3, Src0Hi); 1851 
_sar(T_3, SignExtend); 1852 _mov(DestHi, T_3); 1853 } break; 1854 } 1855 } else { 1856 // COMMON PREFIX OF: a=b SHIFT_OP c ==> 1857 // t2 = b.lo 1858 // t3 = b.hi 1859 _mov(T_2, Src0Lo); 1860 _mov(T_3, Src0Hi); 1861 switch (Op) { 1862 default: 1863 assert(0 && "non-shift op"); 1864 break; 1865 case InstArithmetic::Shl: { 1866 // a=b<<c ==> 1867 // t3 = shld t3, t2, ShiftAmount 1868 // t2 = shl t2, ShiftAmount 1869 _shld(T_3, T_2, ConstantShiftAmount); 1870 _shl(T_2, ConstantShiftAmount); 1871 } break; 1872 case InstArithmetic::Lshr: { 1873 // a=b>>c (unsigned) ==> 1874 // t2 = shrd t2, t3, ShiftAmount 1875 // t3 = shr t3, ShiftAmount 1876 _shrd(T_2, T_3, ConstantShiftAmount); 1877 _shr(T_3, ConstantShiftAmount); 1878 } break; 1879 case InstArithmetic::Ashr: { 1880 // a=b>>c (signed) ==> 1881 // t2 = shrd t2, t3, ShiftAmount 1882 // t3 = sar t3, ShiftAmount 1883 _shrd(T_2, T_3, ConstantShiftAmount); 1884 _sar(T_3, ConstantShiftAmount); 1885 } break; 1886 } 1887 // COMMON SUFFIX OF: a=b SHIFT_OP c ==> 1888 // a.lo = t2 1889 // a.hi = t3 1890 _mov(DestLo, T_2); 1891 _mov(DestHi, T_3); 1892 } 1893 } else { 1894 // NON-CONSTANT CASES. 1895 Constant *BitTest = Ctx->getConstantInt32(0x20); 1896 InstX86Label *Label = InstX86Label::create(Func, this); 1897 // COMMON PREFIX OF: a=b SHIFT_OP c ==> 1898 // t1:ecx = c.lo & 0xff 1899 // t2 = b.lo 1900 // t3 = b.hi 1901 T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl); 1902 _mov(T_2, Src0Lo); 1903 _mov(T_3, Src0Hi); 1904 switch (Op) { 1905 default: 1906 assert(0 && "non-shift op"); 1907 break; 1908 case InstArithmetic::Shl: { 1909 // a=b<<c ==> 1910 // t3 = shld t3, t2, t1 1911 // t2 = shl t2, t1 1912 // test t1, 0x20 1913 // je L1 1914 // use(t3) 1915 // t3 = t2 1916 // t2 = 0 1917 _shld(T_3, T_2, T_1); 1918 _shl(T_2, T_1); 1919 _test(T_1, BitTest); 1920 _br(Traits::Cond::Br_e, Label); 1921 // T_2 and T_3 are being assigned again because of the intra-block control 1922 // flow, so we need to use _redefined to avoid liveness problems. 1923 _redefined(_mov(T_3, T_2)); 1924 _redefined(_mov(T_2, Zero)); 1925 } break; 1926 case InstArithmetic::Lshr: { 1927 // a=b>>c (unsigned) ==> 1928 // t2 = shrd t2, t3, t1 1929 // t3 = shr t3, t1 1930 // test t1, 0x20 1931 // je L1 1932 // use(t2) 1933 // t2 = t3 1934 // t3 = 0 1935 _shrd(T_2, T_3, T_1); 1936 _shr(T_3, T_1); 1937 _test(T_1, BitTest); 1938 _br(Traits::Cond::Br_e, Label); 1939 // T_2 and T_3 are being assigned again because of the intra-block control 1940 // flow, so we need to use _redefined to avoid liveness problems. 1941 _redefined(_mov(T_2, T_3)); 1942 _redefined(_mov(T_3, Zero)); 1943 } break; 1944 case InstArithmetic::Ashr: { 1945 // a=b>>c (signed) ==> 1946 // t2 = shrd t2, t3, t1 1947 // t3 = sar t3, t1 1948 // test t1, 0x20 1949 // je L1 1950 // use(t2) 1951 // t2 = t3 1952 // t3 = sar t3, 0x1f 1953 Constant *SignExtend = Ctx->getConstantInt32(0x1f); 1954 _shrd(T_2, T_3, T_1); 1955 _sar(T_3, T_1); 1956 _test(T_1, BitTest); 1957 _br(Traits::Cond::Br_e, Label); 1958 // T_2 and T_3 are being assigned again because of the intra-block control 1959 // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3 1960 // doesn't need special treatment because it is reassigned via _sar 1961 // instead of _mov. 
1962 _redefined(_mov(T_2, T_3)); 1963 _sar(T_3, SignExtend); 1964 } break; 1965 } 1966 // COMMON SUFFIX OF: a=b SHIFT_OP c ==> 1967 // L1: 1968 // a.lo = t2 1969 // a.hi = t3 1970 Context.insert(Label); 1971 _mov(DestLo, T_2); 1972 _mov(DestHi, T_3); 1973 } 1974 } 1975 1976 template <typename TraitsType> 1977 void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Instr) { 1978 Variable *Dest = Instr->getDest(); 1979 if (Dest->isRematerializable()) { 1980 Context.insert<InstFakeDef>(Dest); 1981 return; 1982 } 1983 Type Ty = Dest->getType(); 1984 Operand *Src0 = legalize(Instr->getSrc(0)); 1985 Operand *Src1 = legalize(Instr->getSrc(1)); 1986 if (Instr->isCommutative()) { 1987 uint32_t SwapCount = 0; 1988 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) { 1989 std::swap(Src0, Src1); 1990 ++SwapCount; 1991 } 1992 if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) { 1993 std::swap(Src0, Src1); 1994 ++SwapCount; 1995 } 1996 // Improve two-address code patterns by avoiding a copy to the dest 1997 // register when one of the source operands ends its lifetime here. 1998 if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) { 1999 std::swap(Src0, Src1); 2000 ++SwapCount; 2001 } 2002 assert(SwapCount <= 1); 2003 (void)SwapCount; 2004 } 2005 if (!Traits::Is64Bit && Ty == IceType_i64) { 2006 // These x86-32 helper-call-involved instructions are lowered in this 2007 // separate switch. This is because loOperand() and hiOperand() may insert 2008 // redundant instructions for constant blinding and pooling. Such redundant 2009 // instructions will fail liveness analysis under -Om1 setting. And, 2010 // actually these arguments do not need to be processed with loOperand() 2011 // and hiOperand() to be used. 2012 switch (Instr->getOp()) { 2013 case InstArithmetic::Udiv: 2014 case InstArithmetic::Sdiv: 2015 case InstArithmetic::Urem: 2016 case InstArithmetic::Srem: 2017 llvm::report_fatal_error("Helper call was expected"); 2018 return; 2019 default: 2020 break; 2021 } 2022 2023 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 2024 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 2025 Operand *Src0Lo = loOperand(Src0); 2026 Operand *Src0Hi = hiOperand(Src0); 2027 Operand *Src1Lo = loOperand(Src1); 2028 Operand *Src1Hi = hiOperand(Src1); 2029 Variable *T_Lo = nullptr, *T_Hi = nullptr; 2030 switch (Instr->getOp()) { 2031 case InstArithmetic::_num: 2032 llvm_unreachable("Unknown arithmetic operator"); 2033 break; 2034 case InstArithmetic::Add: 2035 _mov(T_Lo, Src0Lo); 2036 _add(T_Lo, Src1Lo); 2037 _mov(DestLo, T_Lo); 2038 _mov(T_Hi, Src0Hi); 2039 _adc(T_Hi, Src1Hi); 2040 _mov(DestHi, T_Hi); 2041 break; 2042 case InstArithmetic::And: 2043 _mov(T_Lo, Src0Lo); 2044 _and(T_Lo, Src1Lo); 2045 _mov(DestLo, T_Lo); 2046 _mov(T_Hi, Src0Hi); 2047 _and(T_Hi, Src1Hi); 2048 _mov(DestHi, T_Hi); 2049 break; 2050 case InstArithmetic::Or: 2051 _mov(T_Lo, Src0Lo); 2052 _or(T_Lo, Src1Lo); 2053 _mov(DestLo, T_Lo); 2054 _mov(T_Hi, Src0Hi); 2055 _or(T_Hi, Src1Hi); 2056 _mov(DestHi, T_Hi); 2057 break; 2058 case InstArithmetic::Xor: 2059 _mov(T_Lo, Src0Lo); 2060 _xor(T_Lo, Src1Lo); 2061 _mov(DestLo, T_Lo); 2062 _mov(T_Hi, Src0Hi); 2063 _xor(T_Hi, Src1Hi); 2064 _mov(DestHi, T_Hi); 2065 break; 2066 case InstArithmetic::Sub: 2067 _mov(T_Lo, Src0Lo); 2068 _sub(T_Lo, Src1Lo); 2069 _mov(DestLo, T_Lo); 2070 _mov(T_Hi, Src0Hi); 2071 _sbb(T_Hi, Src1Hi); 2072 _mov(DestHi, T_Hi); 2073 break; 2074 case InstArithmetic::Mul: { 2075 Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr; 2076 Variable 
*T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax); 2077 Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx); 2078 // gcc does the following: 2079 // a=b*c ==> 2080 // t1 = b.hi; t1 *=(imul) c.lo 2081 // t2 = c.hi; t2 *=(imul) b.lo 2082 // t3:eax = b.lo 2083 // t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo 2084 // a.lo = t4.lo 2085 // t4.hi += t1 2086 // t4.hi += t2 2087 // a.hi = t4.hi 2088 // The mul instruction cannot take an immediate operand. 2089 Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem); 2090 _mov(T_1, Src0Hi); 2091 _imul(T_1, Src1Lo); 2092 _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax); 2093 _mul(T_4Lo, T_3, Src1Lo); 2094 // The mul instruction produces two dest variables, edx:eax. We create a 2095 // fake definition of edx to account for this. 2096 Context.insert<InstFakeDef>(T_4Hi, T_4Lo); 2097 Context.insert<InstFakeUse>(T_4Hi); 2098 _mov(DestLo, T_4Lo); 2099 _add(T_4Hi, T_1); 2100 _mov(T_2, Src1Hi); 2101 _imul(T_2, Src0Lo); 2102 _add(T_4Hi, T_2); 2103 _mov(DestHi, T_4Hi); 2104 } break; 2105 case InstArithmetic::Shl: 2106 case InstArithmetic::Lshr: 2107 case InstArithmetic::Ashr: 2108 lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi); 2109 break; 2110 case InstArithmetic::Fadd: 2111 case InstArithmetic::Fsub: 2112 case InstArithmetic::Fmul: 2113 case InstArithmetic::Fdiv: 2114 case InstArithmetic::Frem: 2115 llvm_unreachable("FP instruction with i64 type"); 2116 break; 2117 case InstArithmetic::Udiv: 2118 case InstArithmetic::Sdiv: 2119 case InstArithmetic::Urem: 2120 case InstArithmetic::Srem: 2121 llvm_unreachable("Call-helper-involved instruction for i64 type \ 2122 should have already been handled before"); 2123 break; 2124 } 2125 return; 2126 } 2127 if (isVectorType(Ty)) { 2128 // TODO: Trap on integer divide and integer modulo by zero. See: 2129 // https://code.google.com/p/nativeclient/issues/detail?id=3899 2130 if (llvm::isa<X86OperandMem>(Src1)) 2131 Src1 = legalizeToReg(Src1); 2132 switch (Instr->getOp()) { 2133 case InstArithmetic::_num: 2134 llvm_unreachable("Unknown arithmetic operator"); 2135 break; 2136 case InstArithmetic::Add: { 2137 Variable *T = makeReg(Ty); 2138 _movp(T, Src0); 2139 _padd(T, Src1); 2140 _movp(Dest, T); 2141 } break; 2142 case InstArithmetic::And: { 2143 Variable *T = makeReg(Ty); 2144 _movp(T, Src0); 2145 _pand(T, Src1); 2146 _movp(Dest, T); 2147 } break; 2148 case InstArithmetic::Or: { 2149 Variable *T = makeReg(Ty); 2150 _movp(T, Src0); 2151 _por(T, Src1); 2152 _movp(Dest, T); 2153 } break; 2154 case InstArithmetic::Xor: { 2155 Variable *T = makeReg(Ty); 2156 _movp(T, Src0); 2157 _pxor(T, Src1); 2158 _movp(Dest, T); 2159 } break; 2160 case InstArithmetic::Sub: { 2161 Variable *T = makeReg(Ty); 2162 _movp(T, Src0); 2163 _psub(T, Src1); 2164 _movp(Dest, T); 2165 } break; 2166 case InstArithmetic::Mul: { 2167 bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16; 2168 bool InstructionSetIsValidForPmull = 2169 Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1; 2170 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) { 2171 Variable *T = makeReg(Ty); 2172 _movp(T, Src0); 2173 _pmull(T, Src0 == Src1 ? T : Src1); 2174 _movp(Dest, T); 2175 } else if (Ty == IceType_v4i32) { 2176 // Lowering sequence: 2177 // Note: The mask arguments have index 0 on the left. 
2178 // 2179 // movups T1, Src0 2180 // pshufd T2, Src0, {1,0,3,0} 2181 // pshufd T3, Src1, {1,0,3,0} 2182 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} 2183 // pmuludq T1, Src1 2184 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} 2185 // pmuludq T2, T3 2186 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} 2187 // shufps T1, T2, {0,2,0,2} 2188 // pshufd T4, T1, {0,2,1,3} 2189 // movups Dest, T4 2190 2191 // Mask that directs pshufd to create a vector with entries 2192 // Src[1, 0, 3, 0] 2193 constexpr unsigned Constant1030 = 0x31; 2194 Constant *Mask1030 = Ctx->getConstantInt32(Constant1030); 2195 // Mask that directs shufps to create a vector with entries 2196 // Dest[0, 2], Src[0, 2] 2197 constexpr unsigned Mask0202 = 0x88; 2198 // Mask that directs pshufd to create a vector with entries 2199 // Src[0, 2, 1, 3] 2200 constexpr unsigned Mask0213 = 0xd8; 2201 Variable *T1 = makeReg(IceType_v4i32); 2202 Variable *T2 = makeReg(IceType_v4i32); 2203 Variable *T3 = makeReg(IceType_v4i32); 2204 Variable *T4 = makeReg(IceType_v4i32); 2205 _movp(T1, Src0); 2206 _pshufd(T2, Src0, Mask1030); 2207 _pshufd(T3, Src1, Mask1030); 2208 _pmuludq(T1, Src1); 2209 _pmuludq(T2, T3); 2210 _shufps(T1, T2, Ctx->getConstantInt32(Mask0202)); 2211 _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213)); 2212 _movp(Dest, T4); 2213 } else if (Ty == IceType_v16i8) { 2214 llvm::report_fatal_error("Scalarized operation was expected"); 2215 } else { 2216 llvm::report_fatal_error("Invalid vector multiply type"); 2217 } 2218 } break; 2219 case InstArithmetic::Shl: { 2220 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized"); 2221 Variable *T = makeReg(Ty); 2222 _movp(T, Src0); 2223 _psll(T, Src1); 2224 _movp(Dest, T); 2225 } break; 2226 case InstArithmetic::Lshr: { 2227 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized"); 2228 Variable *T = makeReg(Ty); 2229 _movp(T, Src0); 2230 _psrl(T, Src1); 2231 _movp(Dest, T); 2232 } break; 2233 case InstArithmetic::Ashr: { 2234 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized"); 2235 Variable *T = makeReg(Ty); 2236 _movp(T, Src0); 2237 _psra(T, Src1); 2238 _movp(Dest, T); 2239 } break; 2240 case InstArithmetic::Udiv: 2241 case InstArithmetic::Urem: 2242 case InstArithmetic::Sdiv: 2243 case InstArithmetic::Srem: 2244 llvm::report_fatal_error("Scalarized operation was expected"); 2245 break; 2246 case InstArithmetic::Fadd: { 2247 Variable *T = makeReg(Ty); 2248 _movp(T, Src0); 2249 _addps(T, Src1); 2250 _movp(Dest, T); 2251 } break; 2252 case InstArithmetic::Fsub: { 2253 Variable *T = makeReg(Ty); 2254 _movp(T, Src0); 2255 _subps(T, Src1); 2256 _movp(Dest, T); 2257 } break; 2258 case InstArithmetic::Fmul: { 2259 Variable *T = makeReg(Ty); 2260 _movp(T, Src0); 2261 _mulps(T, Src0 == Src1 ? 
T : Src1); 2262 _movp(Dest, T); 2263 } break; 2264 case InstArithmetic::Fdiv: { 2265 Variable *T = makeReg(Ty); 2266 _movp(T, Src0); 2267 _divps(T, Src1); 2268 _movp(Dest, T); 2269 } break; 2270 case InstArithmetic::Frem: 2271 llvm::report_fatal_error("Scalarized operation was expected"); 2272 break; 2273 } 2274 return; 2275 } 2276 Variable *T_edx = nullptr; 2277 Variable *T = nullptr; 2278 switch (Instr->getOp()) { 2279 case InstArithmetic::_num: 2280 llvm_unreachable("Unknown arithmetic operator"); 2281 break; 2282 case InstArithmetic::Add: { 2283 const bool ValidType = 2284 Ty == IceType_i32 || (Ty == IceType_i64 && Traits::Is64Bit); 2285 auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1)); 2286 const bool ValidKind = 2287 Const != nullptr && (llvm::isa<ConstantInteger32>(Const) || 2288 llvm::isa<ConstantRelocatable>(Const)); 2289 if (getFlags().getAggressiveLea() && ValidType && ValidKind) { 2290 auto *Var = legalizeToReg(Src0); 2291 auto *Mem = Traits::X86OperandMem::create(Func, IceType_void, Var, Const); 2292 T = makeReg(Ty); 2293 _lea(T, _sandbox_mem_reference(Mem)); 2294 _mov(Dest, T); 2295 break; 2296 } 2297 _mov(T, Src0); 2298 _add(T, Src1); 2299 _mov(Dest, T); 2300 } break; 2301 case InstArithmetic::And: 2302 _mov(T, Src0); 2303 _and(T, Src1); 2304 _mov(Dest, T); 2305 break; 2306 case InstArithmetic::Or: 2307 _mov(T, Src0); 2308 _or(T, Src1); 2309 _mov(Dest, T); 2310 break; 2311 case InstArithmetic::Xor: 2312 _mov(T, Src0); 2313 _xor(T, Src1); 2314 _mov(Dest, T); 2315 break; 2316 case InstArithmetic::Sub: 2317 _mov(T, Src0); 2318 _sub(T, Src1); 2319 _mov(Dest, T); 2320 break; 2321 case InstArithmetic::Mul: 2322 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { 2323 if (optimizeScalarMul(Dest, Src0, C->getValue())) 2324 return; 2325 } 2326 // The 8-bit version of imul only allows the form "imul r/m8" where T must 2327 // be in al. 2328 if (isByteSizedArithType(Ty)) { 2329 _mov(T, Src0, Traits::RegisterSet::Reg_al); 2330 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2331 _imul(T, Src0 == Src1 ? T : Src1); 2332 _mov(Dest, T); 2333 } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) { 2334 T = makeReg(Ty); 2335 _imul_imm(T, Src0, ImmConst); 2336 _mov(Dest, T); 2337 } else { 2338 _mov(T, Src0); 2339 _imul(T, Src0 == Src1 ? T : Src1); 2340 _mov(Dest, T); 2341 } 2342 break; 2343 case InstArithmetic::Shl: 2344 _mov(T, Src0); 2345 if (!llvm::isa<ConstantInteger32>(Src1) && 2346 !llvm::isa<ConstantInteger64>(Src1)) 2347 Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl); 2348 _shl(T, Src1); 2349 _mov(Dest, T); 2350 break; 2351 case InstArithmetic::Lshr: 2352 _mov(T, Src0); 2353 if (!llvm::isa<ConstantInteger32>(Src1) && 2354 !llvm::isa<ConstantInteger64>(Src1)) 2355 Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl); 2356 _shr(T, Src1); 2357 _mov(Dest, T); 2358 break; 2359 case InstArithmetic::Ashr: 2360 _mov(T, Src0); 2361 if (!llvm::isa<ConstantInteger32>(Src1) && 2362 !llvm::isa<ConstantInteger64>(Src1)) 2363 Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl); 2364 _sar(T, Src1); 2365 _mov(Dest, T); 2366 break; 2367 case InstArithmetic::Udiv: { 2368 // div and idiv are the few arithmetic operators that do not allow 2369 // immediates as the operand. 
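    // The emitted sequence is essentially: T (eax/ax/al) = Src0;
    // T_edx (edx/dx/ah) = 0; div Src1, leaving the quotient in T and the
    // remainder in T_edx.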
2370 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2371 RegNumT Eax; 2372 RegNumT Edx; 2373 switch (Ty) { 2374 default: 2375 llvm::report_fatal_error("Bad type for udiv"); 2376 case IceType_i64: 2377 Eax = Traits::getRaxOrDie(); 2378 Edx = Traits::getRdxOrDie(); 2379 break; 2380 case IceType_i32: 2381 Eax = Traits::RegisterSet::Reg_eax; 2382 Edx = Traits::RegisterSet::Reg_edx; 2383 break; 2384 case IceType_i16: 2385 Eax = Traits::RegisterSet::Reg_ax; 2386 Edx = Traits::RegisterSet::Reg_dx; 2387 break; 2388 case IceType_i8: 2389 Eax = Traits::RegisterSet::Reg_al; 2390 Edx = Traits::RegisterSet::Reg_ah; 2391 break; 2392 } 2393 T_edx = makeReg(Ty, Edx); 2394 _mov(T, Src0, Eax); 2395 _mov(T_edx, Ctx->getConstantZero(Ty)); 2396 _div(T_edx, Src1, T); 2397 _redefined(Context.insert<InstFakeDef>(T, T_edx)); 2398 _mov(Dest, T); 2399 } break; 2400 case InstArithmetic::Sdiv: 2401 // TODO(stichnot): Enable this after doing better performance and cross 2402 // testing. 2403 if (false && Func->getOptLevel() >= Opt_1) { 2404 // Optimize division by constant power of 2, but not for Om1 or O0, just 2405 // to keep things simple there. 2406 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { 2407 const int32_t Divisor = C->getValue(); 2408 const uint32_t UDivisor = Divisor; 2409 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { 2410 uint32_t LogDiv = llvm::Log2_32(UDivisor); 2411 // LLVM does the following for dest=src/(1<<log): 2412 // t=src 2413 // sar t,typewidth-1 // -1 if src is negative, 0 if not 2414 // shr t,typewidth-log 2415 // add t,src 2416 // sar t,log 2417 // dest=t 2418 uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty); 2419 _mov(T, Src0); 2420 // If for some reason we are dividing by 1, just treat it like an 2421 // assignment. 2422 if (LogDiv > 0) { 2423 // The initial sar is unnecessary when dividing by 2. 
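          // (Worked example for Ty == i32: with Src0 == -7 and LogDiv == 2,
          // the sar gives -1, the shr by 30 gives 3, the add gives -4, and
          // the final sar by 2 yields -1, i.e. trunc(-7 / 4).)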
2424 if (LogDiv > 1) 2425 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); 2426 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); 2427 _add(T, Src0); 2428 _sar(T, Ctx->getConstantInt(Ty, LogDiv)); 2429 } 2430 _mov(Dest, T); 2431 return; 2432 } 2433 } 2434 } 2435 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2436 switch (Ty) { 2437 default: 2438 llvm::report_fatal_error("Bad type for sdiv"); 2439 case IceType_i64: 2440 T_edx = makeReg(Ty, Traits::getRdxOrDie()); 2441 _mov(T, Src0, Traits::getRaxOrDie()); 2442 break; 2443 case IceType_i32: 2444 T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx); 2445 _mov(T, Src0, Traits::RegisterSet::Reg_eax); 2446 break; 2447 case IceType_i16: 2448 T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx); 2449 _mov(T, Src0, Traits::RegisterSet::Reg_ax); 2450 break; 2451 case IceType_i8: 2452 T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax); 2453 _mov(T, Src0, Traits::RegisterSet::Reg_al); 2454 break; 2455 } 2456 _cbwdq(T_edx, T); 2457 _idiv(T_edx, Src1, T); 2458 _redefined(Context.insert<InstFakeDef>(T, T_edx)); 2459 _mov(Dest, T); 2460 break; 2461 case InstArithmetic::Urem: { 2462 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2463 RegNumT Eax; 2464 RegNumT Edx; 2465 switch (Ty) { 2466 default: 2467 llvm::report_fatal_error("Bad type for urem"); 2468 case IceType_i64: 2469 Eax = Traits::getRaxOrDie(); 2470 Edx = Traits::getRdxOrDie(); 2471 break; 2472 case IceType_i32: 2473 Eax = Traits::RegisterSet::Reg_eax; 2474 Edx = Traits::RegisterSet::Reg_edx; 2475 break; 2476 case IceType_i16: 2477 Eax = Traits::RegisterSet::Reg_ax; 2478 Edx = Traits::RegisterSet::Reg_dx; 2479 break; 2480 case IceType_i8: 2481 Eax = Traits::RegisterSet::Reg_al; 2482 Edx = Traits::RegisterSet::Reg_ah; 2483 break; 2484 } 2485 T_edx = makeReg(Ty, Edx); 2486 _mov(T_edx, Ctx->getConstantZero(Ty)); 2487 _mov(T, Src0, Eax); 2488 _div(T, Src1, T_edx); 2489 _redefined(Context.insert<InstFakeDef>(T_edx, T)); 2490 if (Ty == IceType_i8) { 2491 // Register ah must be moved into one of {al,bl,cl,dl} before it can be 2492 // moved into a general 8-bit register. 2493 auto *T_AhRcvr = makeReg(Ty); 2494 T_AhRcvr->setRegClass(RCX86_IsAhRcvr); 2495 _mov(T_AhRcvr, T_edx); 2496 T_edx = T_AhRcvr; 2497 } 2498 _mov(Dest, T_edx); 2499 } break; 2500 case InstArithmetic::Srem: { 2501 // TODO(stichnot): Enable this after doing better performance and cross 2502 // testing. 2503 if (false && Func->getOptLevel() >= Opt_1) { 2504 // Optimize mod by constant power of 2, but not for Om1 or O0, just to 2505 // keep things simple there. 2506 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { 2507 const int32_t Divisor = C->getValue(); 2508 const uint32_t UDivisor = Divisor; 2509 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { 2510 uint32_t LogDiv = llvm::Log2_32(UDivisor); 2511 // LLVM does the following for dest=src%(1<<log): 2512 // t=src 2513 // sar t,typewidth-1 // -1 if src is negative, 0 if not 2514 // shr t,typewidth-log 2515 // add t,src 2516 // and t, -(1<<log) 2517 // sub t,src 2518 // neg t 2519 // dest=t 2520 uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty); 2521 // If for some reason we are dividing by 1, just assign 0. 2522 if (LogDiv == 0) { 2523 _mov(Dest, Ctx->getConstantZero(Ty)); 2524 return; 2525 } 2526 _mov(T, Src0); 2527 // The initial sar is unnecessary when dividing by 2. 
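          // (Worked example for Ty == i32: with Src0 == -7 and LogDiv == 2,
          // the sar gives -1, the shr by 30 gives 3, the add gives -4, the
          // and with -4 keeps -4, the sub of Src0 gives 3, and the neg yields
          // -3, i.e. -7 % 4 under C semantics.)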
2528 if (LogDiv > 1) 2529 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); 2530 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); 2531 _add(T, Src0); 2532 _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv))); 2533 _sub(T, Src0); 2534 _neg(T); 2535 _mov(Dest, T); 2536 return; 2537 } 2538 } 2539 } 2540 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 2541 RegNumT Eax; 2542 RegNumT Edx; 2543 switch (Ty) { 2544 default: 2545 llvm::report_fatal_error("Bad type for srem"); 2546 case IceType_i64: 2547 Eax = Traits::getRaxOrDie(); 2548 Edx = Traits::getRdxOrDie(); 2549 break; 2550 case IceType_i32: 2551 Eax = Traits::RegisterSet::Reg_eax; 2552 Edx = Traits::RegisterSet::Reg_edx; 2553 break; 2554 case IceType_i16: 2555 Eax = Traits::RegisterSet::Reg_ax; 2556 Edx = Traits::RegisterSet::Reg_dx; 2557 break; 2558 case IceType_i8: 2559 Eax = Traits::RegisterSet::Reg_al; 2560 Edx = Traits::RegisterSet::Reg_ah; 2561 break; 2562 } 2563 T_edx = makeReg(Ty, Edx); 2564 _mov(T, Src0, Eax); 2565 _cbwdq(T_edx, T); 2566 _idiv(T, Src1, T_edx); 2567 _redefined(Context.insert<InstFakeDef>(T_edx, T)); 2568 if (Ty == IceType_i8) { 2569 // Register ah must be moved into one of {al,bl,cl,dl} before it can be 2570 // moved into a general 8-bit register. 2571 auto *T_AhRcvr = makeReg(Ty); 2572 T_AhRcvr->setRegClass(RCX86_IsAhRcvr); 2573 _mov(T_AhRcvr, T_edx); 2574 T_edx = T_AhRcvr; 2575 } 2576 _mov(Dest, T_edx); 2577 } break; 2578 case InstArithmetic::Fadd: 2579 _mov(T, Src0); 2580 _addss(T, Src1); 2581 _mov(Dest, T); 2582 break; 2583 case InstArithmetic::Fsub: 2584 _mov(T, Src0); 2585 _subss(T, Src1); 2586 _mov(Dest, T); 2587 break; 2588 case InstArithmetic::Fmul: 2589 _mov(T, Src0); 2590 _mulss(T, Src0 == Src1 ? T : Src1); 2591 _mov(Dest, T); 2592 break; 2593 case InstArithmetic::Fdiv: 2594 _mov(T, Src0); 2595 _divss(T, Src1); 2596 _mov(Dest, T); 2597 break; 2598 case InstArithmetic::Frem: 2599 llvm::report_fatal_error("Helper call was expected"); 2600 break; 2601 } 2602 } 2603 2604 template <typename TraitsType> 2605 void TargetX86Base<TraitsType>::lowerAssign(const InstAssign *Instr) { 2606 Variable *Dest = Instr->getDest(); 2607 if (Dest->isRematerializable()) { 2608 Context.insert<InstFakeDef>(Dest); 2609 return; 2610 } 2611 Operand *Src = Instr->getSrc(0); 2612 assert(Dest->getType() == Src->getType()); 2613 lowerMove(Dest, Src, false); 2614 } 2615 2616 template <typename TraitsType> 2617 void TargetX86Base<TraitsType>::lowerBr(const InstBr *Br) { 2618 if (Br->isUnconditional()) { 2619 _br(Br->getTargetUnconditional()); 2620 return; 2621 } 2622 Operand *Cond = Br->getCondition(); 2623 2624 // Handle folding opportunities. 
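  // If the i1 condition was produced earlier in this block by a compare or a
  // flag-setting arithmetic instruction, re-lower the (already deleted)
  // producer fused with this branch, e.g. icmp+br becomes a single cmp/jcc
  // pair instead of first materializing the boolean into a register.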
2625 if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) { 2626 assert(Producer->isDeleted()); 2627 switch (BoolFolding<Traits>::getProducerKind(Producer)) { 2628 default: 2629 break; 2630 case BoolFolding<Traits>::PK_Icmp32: 2631 case BoolFolding<Traits>::PK_Icmp64: { 2632 lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br); 2633 return; 2634 } 2635 case BoolFolding<Traits>::PK_Fcmp: { 2636 lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br); 2637 return; 2638 } 2639 case BoolFolding<Traits>::PK_Arith: { 2640 lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br); 2641 return; 2642 } 2643 } 2644 } 2645 Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem); 2646 Constant *Zero = Ctx->getConstantZero(IceType_i32); 2647 _cmp(Src0, Zero); 2648 _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse()); 2649 } 2650 2651 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining 2652 // OperandList in lowerCall. std::max() is supposed to work, but it doesn't. 2653 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) { 2654 return S0 < S1 ? S1 : S0; 2655 } 2656 2657 template <typename TraitsType> 2658 void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) { 2659 // Common x86 calling convention lowering: 2660 // 2661 // * At the point before the call, the stack must be aligned to 16 bytes. 2662 // 2663 // * Non-register arguments are pushed onto the stack in right-to-left order, 2664 // such that the left-most argument ends up on the top of the stack at the 2665 // lowest memory address. 2666 // 2667 // * Stack arguments of vector type are aligned to start at the next highest 2668 // multiple of 16 bytes. Other stack arguments are aligned to the next word 2669 // size boundary (4 or 8 bytes, respectively). 2670 RequiredStackAlignment = std::max<size_t>(RequiredStackAlignment, 2671 Traits::X86_STACK_ALIGNMENT_BYTES); 2672 2673 constexpr SizeT MaxOperands = 2674 constexprMax(Traits::X86_MAX_XMM_ARGS, Traits::X86_MAX_GPR_ARGS); 2675 using OperandList = llvm::SmallVector<Operand *, MaxOperands>; 2676 2677 OperandList XmmArgs; 2678 llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices; 2679 CfgVector<std::pair<const Type, Operand *>> GprArgs; 2680 CfgVector<SizeT> GprArgIndices; 2681 OperandList StackArgs, StackArgLocations; 2682 uint32_t ParameterAreaSizeBytes = 0; 2683 2684 ParameterAreaSizeBytes += getShadowStoreSize<Traits>(); 2685 2686 // Classify each argument operand according to the location where the argument 2687 // is passed. 2688 for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) { 2689 Operand *Arg = Instr->getArg(i); 2690 const Type Ty = Arg->getType(); 2691 // The PNaCl ABI requires the width of arguments to be at least 32 bits. 2692 assert(typeWidthInBytes(Ty) >= 4); 2693 if (isVectorType(Ty) && 2694 Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgs.size())) 2695 .hasValue()) { 2696 XmmArgs.push_back(Arg); 2697 XmmArgIndices.push_back(i); 2698 } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM && 2699 Traits::getRegisterForXmmArgNum( 2700 Traits::getArgIndex(i, XmmArgs.size())) 2701 .hasValue()) { 2702 XmmArgs.push_back(Arg); 2703 XmmArgIndices.push_back(i); 2704 } else if (isScalarIntegerType(Ty) && 2705 Traits::getRegisterForGprArgNum( 2706 Ty, Traits::getArgIndex(i, GprArgs.size())) 2707 .hasValue()) { 2708 GprArgs.emplace_back(Ty, Arg); 2709 GprArgIndices.push_back(i); 2710 } else { 2711 // Place on stack. 
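      // (The slot is addressed as [esp + ParameterAreaSizeBytes] at the point
      // of the call; vector arguments are first padded up to a 16-byte
      // boundary.)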
2712 StackArgs.push_back(Arg); 2713 if (isVectorType(Arg->getType())) { 2714 ParameterAreaSizeBytes = 2715 Traits::applyStackAlignment(ParameterAreaSizeBytes); 2716 } 2717 Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType); 2718 Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes); 2719 StackArgLocations.push_back( 2720 Traits::X86OperandMem::create(Func, Ty, esp, Loc)); 2721 ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType()); 2722 } 2723 } 2724 // Ensure there is enough space for the fstp/movs for floating returns. 2725 Variable *Dest = Instr->getDest(); 2726 const Type DestTy = Dest ? Dest->getType() : IceType_void; 2727 if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) { 2728 if (isScalarFloatingType(DestTy)) { 2729 ParameterAreaSizeBytes = 2730 std::max(static_cast<size_t>(ParameterAreaSizeBytes), 2731 typeWidthInBytesOnStack(DestTy)); 2732 } 2733 } 2734 // Adjust the parameter area so that the stack is aligned. It is assumed that 2735 // the stack is already aligned at the start of the calling sequence. 2736 ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes); 2737 assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes()); 2738 // Copy arguments that are passed on the stack to the appropriate stack 2739 // locations. We make sure legalize() is called on each argument at this 2740 // point, to allow availabilityGet() to work. 2741 for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) { 2742 lowerStore( 2743 InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i])); 2744 } 2745 // Copy arguments to be passed in registers to the appropriate registers. 2746 for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) { 2747 XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]), 2748 Traits::getRegisterForXmmArgNum( 2749 Traits::getArgIndex(XmmArgIndices[i], i))); 2750 } 2751 // Materialize moves for arguments passed in GPRs. 2752 for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) { 2753 const Type SignatureTy = GprArgs[i].first; 2754 Operand *Arg = 2755 legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable); 2756 GprArgs[i].second = legalizeToReg( 2757 Arg, Traits::getRegisterForGprArgNum( 2758 Arg->getType(), Traits::getArgIndex(GprArgIndices[i], i))); 2759 assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32); 2760 assert(SignatureTy == Arg->getType()); 2761 (void)SignatureTy; 2762 } 2763 // Generate a FakeUse of register arguments so that they do not get dead code 2764 // eliminated as a result of the FakeKill of scratch registers after the call. 2765 // These need to be right before the call instruction. 2766 for (auto *Arg : XmmArgs) { 2767 Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg)); 2768 } 2769 for (auto &ArgPair : GprArgs) { 2770 Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second)); 2771 } 2772 // Generate the call instruction. Assign its result to a temporary with high 2773 // register allocation weight. 2774 // ReturnReg doubles as ReturnRegLo as necessary. 
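  // Return value convention, as encoded in the switch below: i32 in eax, i64
  // in rax on x86-64 or edx:eax on x86-32, vectors in xmm0, and scalar f32/f64
  // in xmm0 when X86_PASS_SCALAR_FP_IN_XMM, otherwise via x87 st(0) and the
  // fstp handling further down.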
2775 Variable *ReturnReg = nullptr; 2776 Variable *ReturnRegHi = nullptr; 2777 if (Dest) { 2778 switch (DestTy) { 2779 case IceType_NUM: 2780 case IceType_void: 2781 case IceType_i1: 2782 case IceType_i8: 2783 case IceType_i16: 2784 llvm::report_fatal_error("Invalid Call dest type"); 2785 break; 2786 case IceType_i32: 2787 ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax); 2788 break; 2789 case IceType_i64: 2790 if (Traits::Is64Bit) { 2791 ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie()); 2792 } else { 2793 ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax); 2794 ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx); 2795 } 2796 break; 2797 case IceType_f32: 2798 case IceType_f64: 2799 if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) { 2800 // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with 2801 // the fstp instruction. 2802 break; 2803 } 2804 // Fallthrough intended. 2805 case IceType_v4i1: 2806 case IceType_v8i1: 2807 case IceType_v16i1: 2808 case IceType_v16i8: 2809 case IceType_v8i16: 2810 case IceType_v4i32: 2811 case IceType_v4f32: 2812 ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0); 2813 break; 2814 } 2815 } 2816 // Emit the call to the function. 2817 Operand *CallTarget = 2818 legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs); 2819 size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0; 2820 Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs); 2821 // Keep the upper return register live on 32-bit platform. 2822 if (ReturnRegHi) 2823 Context.insert<InstFakeDef>(ReturnRegHi); 2824 // Mark the call as killing all the caller-save registers. 2825 Context.insert<InstFakeKill>(NewCall); 2826 // Handle x86-32 floating point returns. 2827 if (Dest != nullptr && isScalarFloatingType(DestTy) && 2828 !Traits::X86_PASS_SCALAR_FP_IN_XMM) { 2829 // Special treatment for an FP function which returns its result in st(0). 2830 // If Dest ends up being a physical xmm register, the fstp emit code will 2831 // route st(0) through the space reserved in the function argument area 2832 // we allocated. 2833 _fstp(Dest); 2834 // Create a fake use of Dest in case it actually isn't used, because st(0) 2835 // still needs to be popped. 2836 Context.insert<InstFakeUse>(Dest); 2837 } 2838 // Generate a FakeUse to keep the call live if necessary. 2839 if (Instr->hasSideEffects() && ReturnReg) { 2840 Context.insert<InstFakeUse>(ReturnReg); 2841 } 2842 // Process the return value, if any. 2843 if (Dest == nullptr) 2844 return; 2845 // Assign the result of the call to Dest. Route it through a temporary so 2846 // that the local register availability peephole can be subsequently used. 
2847 Variable *Tmp = nullptr; 2848 if (isVectorType(DestTy)) { 2849 assert(ReturnReg && "Vector type requires a return register"); 2850 Tmp = makeReg(DestTy); 2851 _movp(Tmp, ReturnReg); 2852 _movp(Dest, Tmp); 2853 } else if (isScalarFloatingType(DestTy)) { 2854 if (Traits::X86_PASS_SCALAR_FP_IN_XMM) { 2855 assert(ReturnReg && "FP type requires a return register"); 2856 _mov(Tmp, ReturnReg); 2857 _mov(Dest, Tmp); 2858 } 2859 } else { 2860 assert(isScalarIntegerType(DestTy)); 2861 assert(ReturnReg && "Integer type requires a return register"); 2862 if (DestTy == IceType_i64 && !Traits::Is64Bit) { 2863 assert(ReturnRegHi && "64-bit type requires two return registers"); 2864 auto *Dest64On32 = llvm::cast<Variable64On32>(Dest); 2865 Variable *DestLo = Dest64On32->getLo(); 2866 Variable *DestHi = Dest64On32->getHi(); 2867 _mov(Tmp, ReturnReg); 2868 _mov(DestLo, Tmp); 2869 Variable *TmpHi = nullptr; 2870 _mov(TmpHi, ReturnRegHi); 2871 _mov(DestHi, TmpHi); 2872 } else { 2873 _mov(Tmp, ReturnReg); 2874 _mov(Dest, Tmp); 2875 } 2876 } 2877 } 2878 2879 template <typename TraitsType> 2880 void TargetX86Base<TraitsType>::lowerCast(const InstCast *Instr) { 2881 // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap) 2882 InstCast::OpKind CastKind = Instr->getCastKind(); 2883 Variable *Dest = Instr->getDest(); 2884 Type DestTy = Dest->getType(); 2885 switch (CastKind) { 2886 default: 2887 Func->setError("Cast type not supported"); 2888 return; 2889 case InstCast::Sext: { 2890 // Src0RM is the source operand legalized to physical register or memory, 2891 // but not immediate, since the relevant x86 native instructions don't 2892 // allow an immediate operand. If the operand is an immediate, we could 2893 // consider computing the strength-reduced result at translation time, but 2894 // we're unlikely to see something like that in the bitcode that the 2895 // optimizer wouldn't have already taken care of. 2896 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 2897 if (isVectorType(DestTy)) { 2898 if (DestTy == IceType_v16i8) { 2899 // onemask = materialize(1,1,...); dst = (src & onemask) > 0 2900 Variable *OneMask = makeVectorOfOnes(DestTy); 2901 Variable *T = makeReg(DestTy); 2902 _movp(T, Src0RM); 2903 _pand(T, OneMask); 2904 Variable *Zeros = makeVectorOfZeros(DestTy); 2905 _pcmpgt(T, Zeros); 2906 _movp(Dest, T); 2907 } else { 2908 /// width = width(elty) - 1; dest = (src << width) >> width 2909 SizeT ShiftAmount = 2910 Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 2911 1; 2912 Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount); 2913 Variable *T = makeReg(DestTy); 2914 _movp(T, Src0RM); 2915 _psll(T, ShiftConstant); 2916 _psra(T, ShiftConstant); 2917 _movp(Dest, T); 2918 } 2919 } else if (!Traits::Is64Bit && DestTy == IceType_i64) { 2920 // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2 2921 Constant *Shift = Ctx->getConstantInt32(31); 2922 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 2923 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 2924 Variable *T_Lo = makeReg(DestLo->getType()); 2925 if (Src0RM->getType() == IceType_i32) { 2926 _mov(T_Lo, Src0RM); 2927 } else if (Src0RM->getType() == IceType_i1) { 2928 _movzx(T_Lo, Src0RM); 2929 _shl(T_Lo, Shift); 2930 _sar(T_Lo, Shift); 2931 } else { 2932 _movsx(T_Lo, Src0RM); 2933 } 2934 _mov(DestLo, T_Lo); 2935 Variable *T_Hi = nullptr; 2936 _mov(T_Hi, T_Lo); 2937 if (Src0RM->getType() != IceType_i1) 2938 // For i1, the sar instruction is already done above. 
2939 _sar(T_Hi, Shift); 2940 _mov(DestHi, T_Hi); 2941 } else if (Src0RM->getType() == IceType_i1) { 2942 // t1 = src 2943 // shl t1, dst_bitwidth - 1 2944 // sar t1, dst_bitwidth - 1 2945 // dst = t1 2946 size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy); 2947 Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1); 2948 Variable *T = makeReg(DestTy); 2949 if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) { 2950 _mov(T, Src0RM); 2951 } else { 2952 // Widen the source using movsx or movzx. (It doesn't matter which one, 2953 // since the following shl/sar overwrite the bits.) 2954 _movzx(T, Src0RM); 2955 } 2956 _shl(T, ShiftAmount); 2957 _sar(T, ShiftAmount); 2958 _mov(Dest, T); 2959 } else { 2960 // t1 = movsx src; dst = t1 2961 Variable *T = makeReg(DestTy); 2962 _movsx(T, Src0RM); 2963 _mov(Dest, T); 2964 } 2965 break; 2966 } 2967 case InstCast::Zext: { 2968 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 2969 if (isVectorType(DestTy)) { 2970 // onemask = materialize(1,1,...); dest = onemask & src 2971 Variable *OneMask = makeVectorOfOnes(DestTy); 2972 Variable *T = makeReg(DestTy); 2973 _movp(T, Src0RM); 2974 _pand(T, OneMask); 2975 _movp(Dest, T); 2976 } else if (!Traits::Is64Bit && DestTy == IceType_i64) { 2977 // t1=movzx src; dst.lo=t1; dst.hi=0 2978 Constant *Zero = Ctx->getConstantZero(IceType_i32); 2979 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 2980 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 2981 Variable *Tmp = makeReg(DestLo->getType()); 2982 if (Src0RM->getType() == IceType_i32) { 2983 _mov(Tmp, Src0RM); 2984 } else { 2985 _movzx(Tmp, Src0RM); 2986 } 2987 _mov(DestLo, Tmp); 2988 _mov(DestHi, Zero); 2989 } else if (Src0RM->getType() == IceType_i1) { 2990 // t = Src0RM; Dest = t 2991 Variable *T = nullptr; 2992 if (DestTy == IceType_i8) { 2993 _mov(T, Src0RM); 2994 } else { 2995 assert(DestTy != IceType_i1); 2996 assert(Traits::Is64Bit || DestTy != IceType_i64); 2997 // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter. 2998 // In x86-64 we need to widen T to 64-bits to ensure that T -- if 2999 // written to the stack (i.e., in -Om1) will be fully zero-extended. 3000 T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32); 3001 _movzx(T, Src0RM); 3002 } 3003 _mov(Dest, T); 3004 } else { 3005 // t1 = movzx src; dst = t1 3006 Variable *T = makeReg(DestTy); 3007 _movzx(T, Src0RM); 3008 _mov(Dest, T); 3009 } 3010 break; 3011 } 3012 case InstCast::Trunc: { 3013 if (isVectorType(DestTy)) { 3014 // onemask = materialize(1,1,...); dst = src & onemask 3015 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3016 Type Src0Ty = Src0RM->getType(); 3017 Variable *OneMask = makeVectorOfOnes(Src0Ty); 3018 Variable *T = makeReg(DestTy); 3019 _movp(T, Src0RM); 3020 _pand(T, OneMask); 3021 _movp(Dest, T); 3022 } else if (DestTy == IceType_i1 || DestTy == IceType_i8) { 3023 // Make sure we truncate from and into valid registers. 
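      // ("Valid" here means byte-addressable: on x86-32 only al, bl, cl, and
      // dl can hold an i8, hence the copyToReg8 below.)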
3024 Operand *Src0 = legalizeUndef(Instr->getSrc(0)); 3025 if (!Traits::Is64Bit && Src0->getType() == IceType_i64) 3026 Src0 = loOperand(Src0); 3027 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3028 Variable *T = copyToReg8(Src0RM); 3029 if (DestTy == IceType_i1) 3030 _and(T, Ctx->getConstantInt1(1)); 3031 _mov(Dest, T); 3032 } else { 3033 Operand *Src0 = legalizeUndef(Instr->getSrc(0)); 3034 if (!Traits::Is64Bit && Src0->getType() == IceType_i64) 3035 Src0 = loOperand(Src0); 3036 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3037 // t1 = trunc Src0RM; Dest = t1 3038 Variable *T = makeReg(DestTy); 3039 _mov(T, Src0RM); 3040 _mov(Dest, T); 3041 } 3042 break; 3043 } 3044 case InstCast::Fptrunc: 3045 case InstCast::Fpext: { 3046 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3047 // t1 = cvt Src0RM; Dest = t1 3048 Variable *T = makeReg(DestTy); 3049 _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float); 3050 _mov(Dest, T); 3051 break; 3052 } 3053 case InstCast::Fptosi: 3054 if (isVectorType(DestTy)) { 3055 assert(DestTy == IceType_v4i32); 3056 assert(Instr->getSrc(0)->getType() == IceType_v4f32); 3057 Operand *Src0R = legalizeToReg(Instr->getSrc(0)); 3058 Variable *T = makeReg(DestTy); 3059 _cvt(T, Src0R, Traits::Insts::Cvt::Tps2dq); 3060 _movp(Dest, T); 3061 } else if (!Traits::Is64Bit && DestTy == IceType_i64) { 3062 llvm::report_fatal_error("Helper call was expected"); 3063 } else { 3064 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3065 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type 3066 Variable *T_1 = nullptr; 3067 if (Traits::Is64Bit && DestTy == IceType_i64) { 3068 T_1 = makeReg(IceType_i64); 3069 } else { 3070 assert(DestTy != IceType_i64); 3071 T_1 = makeReg(IceType_i32); 3072 } 3073 // cvt() requires its integer argument to be a GPR. 
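      // (For byte-sized destinations, the register classes set below appear
      // to keep T_1 in a register whose low byte is addressable and T_2 in
      // one that can receive the 8-bit truncation.)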
3074 Variable *T_2 = makeReg(DestTy); 3075 if (isByteSizedType(DestTy)) { 3076 assert(T_1->getType() == IceType_i32); 3077 T_1->setRegClass(RCX86_Is32To8); 3078 T_2->setRegClass(RCX86_IsTrunc8Rcvr); 3079 } 3080 _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si); 3081 _mov(T_2, T_1); // T_1 and T_2 may have different integer types 3082 if (DestTy == IceType_i1) 3083 _and(T_2, Ctx->getConstantInt1(1)); 3084 _mov(Dest, T_2); 3085 } 3086 break; 3087 case InstCast::Fptoui: 3088 if (isVectorType(DestTy)) { 3089 llvm::report_fatal_error("Helper call was expected"); 3090 } else if (DestTy == IceType_i64 || 3091 (!Traits::Is64Bit && DestTy == IceType_i32)) { 3092 llvm::report_fatal_error("Helper call was expected"); 3093 } else { 3094 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3095 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type 3096 assert(DestTy != IceType_i64); 3097 Variable *T_1 = nullptr; 3098 if (Traits::Is64Bit && DestTy == IceType_i32) { 3099 T_1 = makeReg(IceType_i64); 3100 } else { 3101 assert(DestTy != IceType_i32); 3102 T_1 = makeReg(IceType_i32); 3103 } 3104 Variable *T_2 = makeReg(DestTy); 3105 if (isByteSizedType(DestTy)) { 3106 assert(T_1->getType() == IceType_i32); 3107 T_1->setRegClass(RCX86_Is32To8); 3108 T_2->setRegClass(RCX86_IsTrunc8Rcvr); 3109 } 3110 _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si); 3111 _mov(T_2, T_1); // T_1 and T_2 may have different integer types 3112 if (DestTy == IceType_i1) 3113 _and(T_2, Ctx->getConstantInt1(1)); 3114 _mov(Dest, T_2); 3115 } 3116 break; 3117 case InstCast::Sitofp: 3118 if (isVectorType(DestTy)) { 3119 assert(DestTy == IceType_v4f32); 3120 assert(Instr->getSrc(0)->getType() == IceType_v4i32); 3121 Operand *Src0R = legalizeToReg(Instr->getSrc(0)); 3122 Variable *T = makeReg(DestTy); 3123 _cvt(T, Src0R, Traits::Insts::Cvt::Dq2ps); 3124 _movp(Dest, T); 3125 } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) { 3126 llvm::report_fatal_error("Helper call was expected"); 3127 } else { 3128 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem); 3129 // Sign-extend the operand. 3130 // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2 3131 Variable *T_1 = nullptr; 3132 if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) { 3133 T_1 = makeReg(IceType_i64); 3134 } else { 3135 assert(Src0RM->getType() != IceType_i64); 3136 T_1 = makeReg(IceType_i32); 3137 } 3138 Variable *T_2 = makeReg(DestTy); 3139 if (Src0RM->getType() == T_1->getType()) 3140 _mov(T_1, Src0RM); 3141 else 3142 _movsx(T_1, Src0RM); 3143 _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss); 3144 _mov(Dest, T_2); 3145 } 3146 break; 3147 case InstCast::Uitofp: { 3148 Operand *Src0 = Instr->getSrc(0); 3149 if (isVectorType(Src0->getType())) { 3150 llvm::report_fatal_error("Helper call was expected"); 3151 } else if (Src0->getType() == IceType_i64 || 3152 (!Traits::Is64Bit && Src0->getType() == IceType_i32)) { 3153 llvm::report_fatal_error("Helper call was expected"); 3154 } else { 3155 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3156 // Zero-extend the operand. 
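      // (Zero-extension suffices here: the widened value is non-negative when
      // viewed as a signed integer, so the signed integer-to-float conversion
      // below produces the exact unsigned value.)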
3157 // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2 3158 Variable *T_1 = nullptr; 3159 if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) { 3160 T_1 = makeReg(IceType_i64); 3161 } else { 3162 assert(Src0RM->getType() != IceType_i64); 3163 assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32); 3164 T_1 = makeReg(IceType_i32); 3165 } 3166 Variable *T_2 = makeReg(DestTy); 3167 if (Src0RM->getType() == T_1->getType()) 3168 _mov(T_1, Src0RM); 3169 else 3170 _movzx(T_1, Src0RM)->setMustKeep(); 3171 _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss); 3172 _mov(Dest, T_2); 3173 } 3174 break; 3175 } 3176 case InstCast::Bitcast: { 3177 Operand *Src0 = Instr->getSrc(0); 3178 if (DestTy == Src0->getType()) { 3179 auto *Assign = InstAssign::create(Func, Dest, Src0); 3180 lowerAssign(Assign); 3181 return; 3182 } 3183 switch (DestTy) { 3184 default: 3185 llvm_unreachable("Unexpected Bitcast dest type"); 3186 case IceType_i8: { 3187 llvm::report_fatal_error("Helper call was expected"); 3188 } break; 3189 case IceType_i16: { 3190 llvm::report_fatal_error("Helper call was expected"); 3191 } break; 3192 case IceType_i32: 3193 case IceType_f32: { 3194 Variable *Src0R = legalizeToReg(Src0); 3195 Variable *T = makeReg(DestTy); 3196 _movd(T, Src0R); 3197 _mov(Dest, T); 3198 } break; 3199 case IceType_i64: { 3200 assert(Src0->getType() == IceType_f64); 3201 if (Traits::Is64Bit) { 3202 Variable *Src0R = legalizeToReg(Src0); 3203 Variable *T = makeReg(IceType_i64); 3204 _movd(T, Src0R); 3205 _mov(Dest, T); 3206 } else { 3207 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3208 // a.i64 = bitcast b.f64 ==> 3209 // s.f64 = spill b.f64 3210 // t_lo.i32 = lo(s.f64) 3211 // a_lo.i32 = t_lo.i32 3212 // t_hi.i32 = hi(s.f64) 3213 // a_hi.i32 = t_hi.i32 3214 Operand *SpillLo, *SpillHi; 3215 if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) { 3216 Variable *Spill = Func->makeVariable(IceType_f64); 3217 Spill->setLinkedTo(Src0Var); 3218 Spill->setMustNotHaveReg(); 3219 _movq(Spill, Src0RM); 3220 SpillLo = Traits::VariableSplit::create(Func, Spill, 3221 Traits::VariableSplit::Low); 3222 SpillHi = Traits::VariableSplit::create(Func, Spill, 3223 Traits::VariableSplit::High); 3224 } else { 3225 SpillLo = loOperand(Src0RM); 3226 SpillHi = hiOperand(Src0RM); 3227 } 3228 3229 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 3230 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 3231 Variable *T_Lo = makeReg(IceType_i32); 3232 Variable *T_Hi = makeReg(IceType_i32); 3233 3234 _mov(T_Lo, SpillLo); 3235 _mov(DestLo, T_Lo); 3236 _mov(T_Hi, SpillHi); 3237 _mov(DestHi, T_Hi); 3238 } 3239 } break; 3240 case IceType_f64: { 3241 assert(Src0->getType() == IceType_i64); 3242 if (Traits::Is64Bit) { 3243 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3244 Variable *T = makeReg(IceType_f64); 3245 _movd(T, Src0RM); 3246 _mov(Dest, T); 3247 } else { 3248 Src0 = legalize(Src0); 3249 if (llvm::isa<X86OperandMem>(Src0)) { 3250 Variable *T = makeReg(DestTy); 3251 _movq(T, Src0); 3252 _movq(Dest, T); 3253 break; 3254 } 3255 // a.f64 = bitcast b.i64 ==> 3256 // t_lo.i32 = b_lo.i32 3257 // FakeDef(s.f64) 3258 // lo(s.f64) = t_lo.i32 3259 // t_hi.i32 = b_hi.i32 3260 // hi(s.f64) = t_hi.i32 3261 // a.f64 = s.f64 3262 Variable *Spill = Func->makeVariable(IceType_f64); 3263 Spill->setLinkedTo(Dest); 3264 Spill->setMustNotHaveReg(); 3265 3266 Variable *T_Lo = nullptr, *T_Hi = nullptr; 3267 auto *SpillLo = Traits::VariableSplit::create( 3268 Func, Spill, Traits::VariableSplit::Low); 3269 auto *SpillHi = 
Traits::VariableSplit::create( 3270 Func, Spill, Traits::VariableSplit::High); 3271 _mov(T_Lo, loOperand(Src0)); 3272 // Technically, the Spill is defined after the _store happens, but 3273 // SpillLo is considered a "use" of Spill so define Spill before it is 3274 // used. 3275 Context.insert<InstFakeDef>(Spill); 3276 _store(T_Lo, SpillLo); 3277 _mov(T_Hi, hiOperand(Src0)); 3278 _store(T_Hi, SpillHi); 3279 _movq(Dest, Spill); 3280 } 3281 } break; 3282 case IceType_v8i1: { 3283 llvm::report_fatal_error("Helper call was expected"); 3284 } break; 3285 case IceType_v16i1: { 3286 llvm::report_fatal_error("Helper call was expected"); 3287 } break; 3288 case IceType_v8i16: 3289 case IceType_v16i8: 3290 case IceType_v4i32: 3291 case IceType_v4f32: { 3292 if (Src0->getType() == IceType_i32) { 3293 // Bitcast requires equal type sizes, which isn't strictly the case 3294 // between scalars and vectors, but to emulate v4i8 vectors one has to 3295 // use v16i8 vectors. 3296 assert(getFlags().getApplicationBinaryInterface() != ABI_PNaCl && 3297 "PNaCl only supports real 128-bit vectors"); 3298 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3299 Variable *T = makeReg(DestTy); 3300 _movd(T, Src0RM); 3301 _mov(Dest, T); 3302 } else { 3303 _movp(Dest, legalizeToReg(Src0)); 3304 } 3305 } break; 3306 } 3307 break; 3308 } 3309 } 3310 } 3311 3312 template <typename TraitsType> 3313 void TargetX86Base<TraitsType>::lowerExtractElement( 3314 const InstExtractElement *Instr) { 3315 Operand *SourceVectNotLegalized = Instr->getSrc(0); 3316 auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1)); 3317 // Only constant indices are allowed in PNaCl IR. 3318 assert(ElementIndex); 3319 3320 unsigned Index = ElementIndex->getValue(); 3321 Type Ty = SourceVectNotLegalized->getType(); 3322 Type ElementTy = typeElementType(Ty); 3323 Type InVectorElementTy = Traits::getInVectorElementType(Ty); 3324 3325 // TODO(wala): Determine the best lowering sequences for each type. 3326 bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 || 3327 (InstructionSet >= Traits::SSE4_1 && Ty != IceType_v4f32); 3328 Variable *ExtractedElementR = 3329 makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy); 3330 if (CanUsePextr) { 3331 // Use pextrb, pextrw, or pextrd. The "b" and "w" versions clear the upper 3332 // bits of the destination register, so we represent this by always 3333 // extracting into an i32 register. The _mov into Dest below will do 3334 // truncation as necessary. 3335 Constant *Mask = Ctx->getConstantInt32(Index); 3336 Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized); 3337 _pextr(ExtractedElementR, SourceVectR, Mask); 3338 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { 3339 // Use pshufd and movd/movss. 3340 Variable *T = nullptr; 3341 if (Index) { 3342 // The shuffle only needs to occur if the element to be extracted is not 3343 // at the lowest index. 3344 Constant *Mask = Ctx->getConstantInt32(Index); 3345 T = makeReg(Ty); 3346 _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask); 3347 } else { 3348 T = legalizeToReg(SourceVectNotLegalized); 3349 } 3350 3351 if (InVectorElementTy == IceType_i32) { 3352 _movd(ExtractedElementR, T); 3353 } else { // Ty == IceType_f32 3354 // TODO(wala): _movss is only used here because _mov does not allow a 3355 // vector source and a scalar destination. _mov should be able to be 3356 // used here. 
3357 // _movss is a binary instruction, so the FakeDef is needed to keep the 3358 // live range analysis consistent. 3359 Context.insert<InstFakeDef>(ExtractedElementR); 3360 _movss(ExtractedElementR, T); 3361 } 3362 } else { 3363 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); 3364 // Spill the value to a stack slot and do the extraction in memory. 3365 // 3366 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support 3367 // for legalizing to mem is implemented. 3368 Variable *Slot = Func->makeVariable(Ty); 3369 Slot->setMustNotHaveReg(); 3370 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); 3371 3372 // Compute the location of the element in memory. 3373 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); 3374 X86OperandMem *Loc = 3375 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); 3376 _mov(ExtractedElementR, Loc); 3377 } 3378 3379 if (ElementTy == IceType_i1) { 3380 // Truncate extracted integers to i1s if necessary. 3381 Variable *T = makeReg(IceType_i1); 3382 InstCast *Cast = 3383 InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR); 3384 lowerCast(Cast); 3385 ExtractedElementR = T; 3386 } 3387 3388 // Copy the element to the destination. 3389 Variable *Dest = Instr->getDest(); 3390 _mov(Dest, ExtractedElementR); 3391 } 3392 3393 template <typename TraitsType> 3394 void TargetX86Base<TraitsType>::lowerFcmp(const InstFcmp *Fcmp) { 3395 Variable *Dest = Fcmp->getDest(); 3396 3397 if (isVectorType(Dest->getType())) { 3398 lowerFcmpVector(Fcmp); 3399 } else { 3400 constexpr Inst *Consumer = nullptr; 3401 lowerFcmpAndConsumer(Fcmp, Consumer); 3402 } 3403 } 3404 3405 template <typename TraitsType> 3406 void TargetX86Base<TraitsType>::lowerFcmpAndConsumer(const InstFcmp *Fcmp, 3407 const Inst *Consumer) { 3408 Operand *Src0 = Fcmp->getSrc(0); 3409 Operand *Src1 = Fcmp->getSrc(1); 3410 Variable *Dest = Fcmp->getDest(); 3411 3412 if (Consumer != nullptr) { 3413 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3414 if (lowerOptimizeFcmpSelect(Fcmp, Select)) 3415 return; 3416 } 3417 } 3418 3419 if (isVectorType(Dest->getType())) { 3420 lowerFcmp(Fcmp); 3421 if (Consumer != nullptr) 3422 lowerSelectVector(llvm::cast<InstSelect>(Consumer)); 3423 return; 3424 } 3425 3426 // Lowering a = fcmp cond, b, c 3427 // ucomiss b, c /* only if C1 != Br_None */ 3428 // /* but swap b,c order if SwapOperands==true */ 3429 // mov a, <default> 3430 // j<C1> label /* only if C1 != Br_None */ 3431 // j<C2> label /* only if C2 != Br_None */ 3432 // FakeUse(a) /* only if C1 != Br_None */ 3433 // mov a, !<default> /* only if C1 != Br_None */ 3434 // label: /* only if C1 != Br_None */ 3435 // 3436 // setcc lowering when C1 != Br_None && C2 == Br_None: 3437 // ucomiss b, c /* but swap b,c order if SwapOperands==true */ 3438 // setcc a, C1 3439 InstFcmp::FCond Condition = Fcmp->getCondition(); 3440 assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize); 3441 if (Traits::TableFcmp[Condition].SwapScalarOperands) 3442 std::swap(Src0, Src1); 3443 const bool HasC1 = (Traits::TableFcmp[Condition].C1 != Traits::Cond::Br_None); 3444 const bool HasC2 = (Traits::TableFcmp[Condition].C2 != Traits::Cond::Br_None); 3445 if (HasC1) { 3446 Src0 = legalize(Src0); 3447 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 3448 Variable *T = nullptr; 3449 _mov(T, Src0); 3450 _ucomiss(T, Src1RM); 3451 if (!HasC2) { 3452 assert(Traits::TableFcmp[Condition].Default); 3453 setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer); 3454 return; 3455 } 
3456 } 3457 int32_t IntDefault = Traits::TableFcmp[Condition].Default; 3458 if (Consumer == nullptr) { 3459 Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault); 3460 _mov(Dest, Default); 3461 if (HasC1) { 3462 InstX86Label *Label = InstX86Label::create(Func, this); 3463 _br(Traits::TableFcmp[Condition].C1, Label); 3464 if (HasC2) { 3465 _br(Traits::TableFcmp[Condition].C2, Label); 3466 } 3467 Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault); 3468 _redefined(_mov(Dest, NonDefault)); 3469 Context.insert(Label); 3470 } 3471 return; 3472 } 3473 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3474 CfgNode *TrueSucc = Br->getTargetTrue(); 3475 CfgNode *FalseSucc = Br->getTargetFalse(); 3476 if (IntDefault != 0) 3477 std::swap(TrueSucc, FalseSucc); 3478 if (HasC1) { 3479 _br(Traits::TableFcmp[Condition].C1, FalseSucc); 3480 if (HasC2) { 3481 _br(Traits::TableFcmp[Condition].C2, FalseSucc); 3482 } 3483 _br(TrueSucc); 3484 return; 3485 } 3486 _br(FalseSucc); 3487 return; 3488 } 3489 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3490 Operand *SrcT = Select->getTrueOperand(); 3491 Operand *SrcF = Select->getFalseOperand(); 3492 Variable *SelectDest = Select->getDest(); 3493 if (IntDefault != 0) 3494 std::swap(SrcT, SrcF); 3495 lowerMove(SelectDest, SrcF, false); 3496 if (HasC1) { 3497 InstX86Label *Label = InstX86Label::create(Func, this); 3498 _br(Traits::TableFcmp[Condition].C1, Label); 3499 if (HasC2) { 3500 _br(Traits::TableFcmp[Condition].C2, Label); 3501 } 3502 static constexpr bool IsRedefinition = true; 3503 lowerMove(SelectDest, SrcT, IsRedefinition); 3504 Context.insert(Label); 3505 } 3506 return; 3507 } 3508 llvm::report_fatal_error("Unexpected consumer type"); 3509 } 3510 3511 template <typename TraitsType> 3512 void TargetX86Base<TraitsType>::lowerFcmpVector(const InstFcmp *Fcmp) { 3513 Operand *Src0 = Fcmp->getSrc(0); 3514 Operand *Src1 = Fcmp->getSrc(1); 3515 Variable *Dest = Fcmp->getDest(); 3516 3517 if (!isVectorType(Dest->getType())) 3518 llvm::report_fatal_error("Expected vector compare"); 3519 3520 InstFcmp::FCond Condition = Fcmp->getCondition(); 3521 assert(static_cast<size_t>(Condition) < Traits::TableFcmpSize); 3522 3523 if (Traits::TableFcmp[Condition].SwapVectorOperands) 3524 std::swap(Src0, Src1); 3525 3526 Variable *T = nullptr; 3527 3528 if (Condition == InstFcmp::True) { 3529 // makeVectorOfOnes() requires an integer vector type. 3530 T = makeVectorOfMinusOnes(IceType_v4i32); 3531 } else if (Condition == InstFcmp::False) { 3532 T = makeVectorOfZeros(Dest->getType()); 3533 } else { 3534 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3535 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 3536 if (llvm::isa<X86OperandMem>(Src1RM)) 3537 Src1RM = legalizeToReg(Src1RM); 3538 3539 switch (Condition) { 3540 default: { 3541 const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate; 3542 assert(Predicate != Traits::Cond::Cmpps_Invalid); 3543 T = makeReg(Src0RM->getType()); 3544 _movp(T, Src0RM); 3545 _cmpps(T, Src1RM, Predicate); 3546 } break; 3547 case InstFcmp::One: { 3548 // Check both unequal and ordered. 3549 T = makeReg(Src0RM->getType()); 3550 Variable *T2 = makeReg(Src0RM->getType()); 3551 _movp(T, Src0RM); 3552 _cmpps(T, Src1RM, Traits::Cond::Cmpps_neq); 3553 _movp(T2, Src0RM); 3554 _cmpps(T2, Src1RM, Traits::Cond::Cmpps_ord); 3555 _pand(T, T2); 3556 } break; 3557 case InstFcmp::Ueq: { 3558 // Check both equal or unordered. 
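// Illustrative sketch of what is built below (operand names are symbolic,
// not actual register assignments):
//   movp t,  src0 ; cmpps t,  src1, eq     /* lanes that compare equal */
//   movp t2, src0 ; cmpps t2, src1, unord  /* lanes that are unordered/NaN */
//   por  t,  t2                            /* ueq = eq | unord */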
3559 T = makeReg(Src0RM->getType()); 3560 Variable *T2 = makeReg(Src0RM->getType()); 3561 _movp(T, Src0RM); 3562 _cmpps(T, Src1RM, Traits::Cond::Cmpps_eq); 3563 _movp(T2, Src0RM); 3564 _cmpps(T2, Src1RM, Traits::Cond::Cmpps_unord); 3565 _por(T, T2); 3566 } break; 3567 } 3568 } 3569 3570 assert(T != nullptr); 3571 _movp(Dest, T); 3572 eliminateNextVectorSextInstruction(Dest); 3573 } 3574 3575 inline bool isZero(const Operand *Opnd) { 3576 if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd)) 3577 return C64->getValue() == 0; 3578 if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd)) 3579 return C32->getValue() == 0; 3580 return false; 3581 } 3582 3583 template <typename TraitsType> 3584 void TargetX86Base<TraitsType>::lowerIcmpAndConsumer(const InstIcmp *Icmp, 3585 const Inst *Consumer) { 3586 Operand *Src0 = legalize(Icmp->getSrc(0)); 3587 Operand *Src1 = legalize(Icmp->getSrc(1)); 3588 Variable *Dest = Icmp->getDest(); 3589 3590 if (isVectorType(Dest->getType())) { 3591 lowerIcmp(Icmp); 3592 if (Consumer != nullptr) 3593 lowerSelectVector(llvm::cast<InstSelect>(Consumer)); 3594 return; 3595 } 3596 3597 if (!Traits::Is64Bit && Src0->getType() == IceType_i64) { 3598 lowerIcmp64(Icmp, Consumer); 3599 return; 3600 } 3601 3602 // cmp b, c 3603 if (isZero(Src1)) { 3604 switch (Icmp->getCondition()) { 3605 default: 3606 break; 3607 case InstIcmp::Uge: 3608 movOrConsumer(true, Dest, Consumer); 3609 return; 3610 case InstIcmp::Ult: 3611 movOrConsumer(false, Dest, Consumer); 3612 return; 3613 } 3614 } 3615 Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1); 3616 _cmp(Src0RM, Src1); 3617 setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest, 3618 Consumer); 3619 } 3620 3621 template <typename TraitsType> 3622 void TargetX86Base<TraitsType>::lowerIcmpVector(const InstIcmp *Icmp) { 3623 Operand *Src0 = legalize(Icmp->getSrc(0)); 3624 Operand *Src1 = legalize(Icmp->getSrc(1)); 3625 Variable *Dest = Icmp->getDest(); 3626 3627 if (!isVectorType(Dest->getType())) 3628 llvm::report_fatal_error("Expected a vector compare"); 3629 3630 Type Ty = Src0->getType(); 3631 // Promote i1 vectors to 128 bit integer vector types. 3632 if (typeElementType(Ty) == IceType_i1) { 3633 Type NewTy = IceType_NUM; 3634 switch (Ty) { 3635 default: 3636 llvm::report_fatal_error("unexpected type"); 3637 break; 3638 case IceType_v4i1: 3639 NewTy = IceType_v4i32; 3640 break; 3641 case IceType_v8i1: 3642 NewTy = IceType_v8i16; 3643 break; 3644 case IceType_v16i1: 3645 NewTy = IceType_v16i8; 3646 break; 3647 } 3648 Variable *NewSrc0 = Func->makeVariable(NewTy); 3649 Variable *NewSrc1 = Func->makeVariable(NewTy); 3650 lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0)); 3651 lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1)); 3652 Src0 = NewSrc0; 3653 Src1 = NewSrc1; 3654 Ty = NewTy; 3655 } 3656 3657 InstIcmp::ICond Condition = Icmp->getCondition(); 3658 3659 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 3660 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 3661 3662 // SSE2 only has signed comparison operations. Transform unsigned inputs in 3663 // a manner that allows for the use of signed comparison operations by 3664 // flipping the high order bits. 
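// Illustrative example (values chosen for exposition only): for i32 lanes
// the splat is 0x80000000, and xoring it into both operands maps unsigned
// order onto signed order, e.g. 0xFFFFFFFF -> 0x7FFFFFFF (INT_MAX) and
// 0x00000000 -> 0x80000000 (INT_MIN), so the signed pcmpgt used below
// answers the original unsigned question.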
3665 if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge || 3666 Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) { 3667 Variable *T0 = makeReg(Ty); 3668 Variable *T1 = makeReg(Ty); 3669 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty); 3670 _movp(T0, Src0RM); 3671 _pxor(T0, HighOrderBits); 3672 _movp(T1, Src1RM); 3673 _pxor(T1, HighOrderBits); 3674 Src0RM = T0; 3675 Src1RM = T1; 3676 } 3677 3678 Variable *T = makeReg(Ty); 3679 switch (Condition) { 3680 default: 3681 llvm_unreachable("unexpected condition"); 3682 break; 3683 case InstIcmp::Eq: { 3684 if (llvm::isa<X86OperandMem>(Src1RM)) 3685 Src1RM = legalizeToReg(Src1RM); 3686 _movp(T, Src0RM); 3687 _pcmpeq(T, Src1RM); 3688 } break; 3689 case InstIcmp::Ne: { 3690 if (llvm::isa<X86OperandMem>(Src1RM)) 3691 Src1RM = legalizeToReg(Src1RM); 3692 _movp(T, Src0RM); 3693 _pcmpeq(T, Src1RM); 3694 Variable *MinusOne = makeVectorOfMinusOnes(Ty); 3695 _pxor(T, MinusOne); 3696 } break; 3697 case InstIcmp::Ugt: 3698 case InstIcmp::Sgt: { 3699 if (llvm::isa<X86OperandMem>(Src1RM)) 3700 Src1RM = legalizeToReg(Src1RM); 3701 _movp(T, Src0RM); 3702 _pcmpgt(T, Src1RM); 3703 } break; 3704 case InstIcmp::Uge: 3705 case InstIcmp::Sge: { 3706 // !(Src1RM > Src0RM) 3707 if (llvm::isa<X86OperandMem>(Src0RM)) 3708 Src0RM = legalizeToReg(Src0RM); 3709 _movp(T, Src1RM); 3710 _pcmpgt(T, Src0RM); 3711 Variable *MinusOne = makeVectorOfMinusOnes(Ty); 3712 _pxor(T, MinusOne); 3713 } break; 3714 case InstIcmp::Ult: 3715 case InstIcmp::Slt: { 3716 if (llvm::isa<X86OperandMem>(Src0RM)) 3717 Src0RM = legalizeToReg(Src0RM); 3718 _movp(T, Src1RM); 3719 _pcmpgt(T, Src0RM); 3720 } break; 3721 case InstIcmp::Ule: 3722 case InstIcmp::Sle: { 3723 // !(Src0RM > Src1RM) 3724 if (llvm::isa<X86OperandMem>(Src1RM)) 3725 Src1RM = legalizeToReg(Src1RM); 3726 _movp(T, Src0RM); 3727 _pcmpgt(T, Src1RM); 3728 Variable *MinusOne = makeVectorOfMinusOnes(Ty); 3729 _pxor(T, MinusOne); 3730 } break; 3731 } 3732 3733 _movp(Dest, T); 3734 eliminateNextVectorSextInstruction(Dest); 3735 } 3736 3737 template <typename TraitsType> 3738 template <typename T> 3739 typename std::enable_if<!T::Is64Bit, void>::type 3740 TargetX86Base<TraitsType>::lowerIcmp64(const InstIcmp *Icmp, 3741 const Inst *Consumer) { 3742 // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1: 3743 Operand *Src0 = legalize(Icmp->getSrc(0)); 3744 Operand *Src1 = legalize(Icmp->getSrc(1)); 3745 Variable *Dest = Icmp->getDest(); 3746 InstIcmp::ICond Condition = Icmp->getCondition(); 3747 assert(static_cast<size_t>(Condition) < Traits::TableIcmp64Size); 3748 Operand *Src0LoRM = nullptr; 3749 Operand *Src0HiRM = nullptr; 3750 // Legalize the portions of Src0 that are going to be needed. 3751 if (isZero(Src1)) { 3752 switch (Condition) { 3753 default: 3754 llvm_unreachable("unexpected condition"); 3755 break; 3756 // These two are not optimized, so we fall through to the general case, 3757 // which needs the upper and lower halves legalized. 3758 case InstIcmp::Sgt: 3759 case InstIcmp::Sle: 3760 // These four compare after performing an "or" of the high and low half, so 3761 // they need the upper and lower halves legalized. 3762 case InstIcmp::Eq: 3763 case InstIcmp::Ule: 3764 case InstIcmp::Ne: 3765 case InstIcmp::Ugt: 3766 Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem); 3767 // These two test only the high half's sign bit, so they need only 3768 // the upper half legalized. 
3769 case InstIcmp::Sge: 3770 case InstIcmp::Slt: 3771 Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem); 3772 break; 3773 3774 // These two move constants and hence need no legalization. 3775 case InstIcmp::Uge: 3776 case InstIcmp::Ult: 3777 break; 3778 } 3779 } else { 3780 Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem); 3781 Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem); 3782 } 3783 // Optimize comparisons with zero. 3784 if (isZero(Src1)) { 3785 Constant *SignMask = Ctx->getConstantInt32(0x80000000); 3786 Variable *Temp = nullptr; 3787 switch (Condition) { 3788 default: 3789 llvm_unreachable("unexpected condition"); 3790 break; 3791 case InstIcmp::Eq: 3792 case InstIcmp::Ule: 3793 // Mov Src0HiRM first, because it was legalized most recently, and will 3794 // sometimes avoid a move before the OR. 3795 _mov(Temp, Src0HiRM); 3796 _or(Temp, Src0LoRM); 3797 Context.insert<InstFakeUse>(Temp); 3798 setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer); 3799 return; 3800 case InstIcmp::Ne: 3801 case InstIcmp::Ugt: 3802 // Mov Src0HiRM first, because it was legalized most recently, and will 3803 // sometimes avoid a move before the OR. 3804 _mov(Temp, Src0HiRM); 3805 _or(Temp, Src0LoRM); 3806 Context.insert<InstFakeUse>(Temp); 3807 setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer); 3808 return; 3809 case InstIcmp::Uge: 3810 movOrConsumer(true, Dest, Consumer); 3811 return; 3812 case InstIcmp::Ult: 3813 movOrConsumer(false, Dest, Consumer); 3814 return; 3815 case InstIcmp::Sgt: 3816 break; 3817 case InstIcmp::Sge: 3818 _test(Src0HiRM, SignMask); 3819 setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer); 3820 return; 3821 case InstIcmp::Slt: 3822 _test(Src0HiRM, SignMask); 3823 setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer); 3824 return; 3825 case InstIcmp::Sle: 3826 break; 3827 } 3828 } 3829 // Handle general compares. 
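// Illustrative shape of the general case (C1/C2/C3 come from TableIcmp64;
// labels and operands below are symbolic):
//   cmp src0.hi, src1.hi
//   jC1 <true path>      /* settled by the high halves alone */
//   jC2 <false path>
//   cmp src0.lo, src1.lo
//   jC3 <true path>      /* low halves break the tie */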
3830 Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm); 3831 Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm); 3832 if (Consumer == nullptr) { 3833 Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0); 3834 Constant *One = Ctx->getConstantInt(Dest->getType(), 1); 3835 InstX86Label *LabelFalse = InstX86Label::create(Func, this); 3836 InstX86Label *LabelTrue = InstX86Label::create(Func, this); 3837 _mov(Dest, One); 3838 _cmp(Src0HiRM, Src1HiRI); 3839 if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None) 3840 _br(Traits::TableIcmp64[Condition].C1, LabelTrue); 3841 if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None) 3842 _br(Traits::TableIcmp64[Condition].C2, LabelFalse); 3843 _cmp(Src0LoRM, Src1LoRI); 3844 _br(Traits::TableIcmp64[Condition].C3, LabelTrue); 3845 Context.insert(LabelFalse); 3846 _redefined(_mov(Dest, Zero)); 3847 Context.insert(LabelTrue); 3848 return; 3849 } 3850 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3851 _cmp(Src0HiRM, Src1HiRI); 3852 if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None) 3853 _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue()); 3854 if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None) 3855 _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse()); 3856 _cmp(Src0LoRM, Src1LoRI); 3857 _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(), 3858 Br->getTargetFalse()); 3859 return; 3860 } 3861 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3862 Operand *SrcT = Select->getTrueOperand(); 3863 Operand *SrcF = Select->getFalseOperand(); 3864 Variable *SelectDest = Select->getDest(); 3865 InstX86Label *LabelFalse = InstX86Label::create(Func, this); 3866 InstX86Label *LabelTrue = InstX86Label::create(Func, this); 3867 lowerMove(SelectDest, SrcT, false); 3868 _cmp(Src0HiRM, Src1HiRI); 3869 if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None) 3870 _br(Traits::TableIcmp64[Condition].C1, LabelTrue); 3871 if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None) 3872 _br(Traits::TableIcmp64[Condition].C2, LabelFalse); 3873 _cmp(Src0LoRM, Src1LoRI); 3874 _br(Traits::TableIcmp64[Condition].C3, LabelTrue); 3875 Context.insert(LabelFalse); 3876 static constexpr bool IsRedefinition = true; 3877 lowerMove(SelectDest, SrcF, IsRedefinition); 3878 Context.insert(LabelTrue); 3879 return; 3880 } 3881 llvm::report_fatal_error("Unexpected consumer type"); 3882 } 3883 3884 template <typename TraitsType> 3885 void TargetX86Base<TraitsType>::setccOrConsumer(BrCond Condition, 3886 Variable *Dest, 3887 const Inst *Consumer) { 3888 if (Consumer == nullptr) { 3889 _setcc(Dest, Condition); 3890 return; 3891 } 3892 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3893 _br(Condition, Br->getTargetTrue(), Br->getTargetFalse()); 3894 return; 3895 } 3896 if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3897 Operand *SrcT = Select->getTrueOperand(); 3898 Operand *SrcF = Select->getFalseOperand(); 3899 Variable *SelectDest = Select->getDest(); 3900 lowerSelectMove(SelectDest, Condition, SrcT, SrcF); 3901 return; 3902 } 3903 llvm::report_fatal_error("Unexpected consumer type"); 3904 } 3905 3906 template <typename TraitsType> 3907 void TargetX86Base<TraitsType>::movOrConsumer(bool IcmpResult, Variable *Dest, 3908 const Inst *Consumer) { 3909 if (Consumer == nullptr) { 3910 _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 
1 : 0))); 3911 return; 3912 } 3913 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3914 // TODO(sehr,stichnot): This could be done with a single unconditional 3915 // branch instruction, but subzero doesn't know how to handle the resulting 3916 // control flow graph changes now. Make it do so to eliminate mov and cmp. 3917 _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0))); 3918 _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0)); 3919 _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse()); 3920 return; 3921 } 3922 if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) { 3923 Operand *Src = nullptr; 3924 if (IcmpResult) { 3925 Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm); 3926 } else { 3927 Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm); 3928 } 3929 Variable *SelectDest = Select->getDest(); 3930 lowerMove(SelectDest, Src, false); 3931 return; 3932 } 3933 llvm::report_fatal_error("Unexpected consumer type"); 3934 } 3935 3936 template <typename TraitsType> 3937 void TargetX86Base<TraitsType>::lowerArithAndConsumer( 3938 const InstArithmetic *Arith, const Inst *Consumer) { 3939 Variable *T = nullptr; 3940 Operand *Src0 = legalize(Arith->getSrc(0)); 3941 Operand *Src1 = legalize(Arith->getSrc(1)); 3942 Variable *Dest = Arith->getDest(); 3943 switch (Arith->getOp()) { 3944 default: 3945 llvm_unreachable("arithmetic operator not AND or OR"); 3946 break; 3947 case InstArithmetic::And: 3948 _mov(T, Src0); 3949 // Test cannot have an address in the second position. Since T is 3950 // guaranteed to be a register and Src1 could be a memory load, ensure 3951 // that the second argument is a register. 3952 if (llvm::isa<Constant>(Src1)) 3953 _test(T, Src1); 3954 else 3955 _test(Src1, T); 3956 break; 3957 case InstArithmetic::Or: 3958 _mov(T, Src0); 3959 _or(T, Src1); 3960 break; 3961 } 3962 3963 if (Consumer == nullptr) { 3964 llvm::report_fatal_error("Expected a consumer instruction"); 3965 } 3966 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) { 3967 Context.insert<InstFakeUse>(T); 3968 Context.insert<InstFakeDef>(Dest); 3969 _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse()); 3970 return; 3971 } 3972 llvm::report_fatal_error("Unexpected consumer type"); 3973 } 3974 3975 template <typename TraitsType> 3976 void TargetX86Base<TraitsType>::lowerInsertElement( 3977 const InstInsertElement *Instr) { 3978 Operand *SourceVectNotLegalized = Instr->getSrc(0); 3979 Operand *ElementToInsertNotLegalized = Instr->getSrc(1); 3980 auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2)); 3981 // Only constant indices are allowed in PNaCl IR. 3982 assert(ElementIndex); 3983 unsigned Index = ElementIndex->getValue(); 3984 assert(Index < typeNumElements(SourceVectNotLegalized->getType())); 3985 3986 Type Ty = SourceVectNotLegalized->getType(); 3987 Type ElementTy = typeElementType(Ty); 3988 Type InVectorElementTy = Traits::getInVectorElementType(Ty); 3989 3990 if (ElementTy == IceType_i1) { 3991 // Expand the element to the appropriate size for it to be inserted in the 3992 // vector. 3993 Variable *Expanded = Func->makeVariable(InVectorElementTy); 3994 auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded, 3995 ElementToInsertNotLegalized); 3996 lowerCast(Cast); 3997 ElementToInsertNotLegalized = Expanded; 3998 } 3999 4000 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || 4001 InstructionSet >= Traits::SSE4_1) { 4002 // Use insertps, pinsrb, pinsrw, or pinsrd. 
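// Illustrative immediates (informal note): insertps carries the destination
// lane in bits [5:4] of its immediate, hence the Index << 4 below (e.g.
// lane 2 -> 0x20), while pinsrb/pinsrw/pinsrd take the lane index directly.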
4003 Operand *ElementRM = 4004 legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem); 4005 Operand *SourceVectRM = 4006 legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem); 4007 Variable *T = makeReg(Ty); 4008 _movp(T, SourceVectRM); 4009 if (Ty == IceType_v4f32) { 4010 _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4)); 4011 } else { 4012 // For the pinsrb and pinsrw instructions, when the source operand is a 4013 // register, it must be a full r32 register like eax, and not ax/al/ah. 4014 // For filetype=asm, InstX86Pinsr<TraitsType>::emit() compensates for 4015 // the use 4016 // of r16 and r8 by converting them through getBaseReg(), while emitIAS() 4017 // validates that the original and base register encodings are the same. 4018 if (ElementRM->getType() == IceType_i8 && 4019 llvm::isa<Variable>(ElementRM)) { 4020 // Don't use ah/bh/ch/dh for pinsrb. 4021 ElementRM = copyToReg8(ElementRM); 4022 } 4023 _pinsr(T, ElementRM, Ctx->getConstantInt32(Index)); 4024 } 4025 _movp(Instr->getDest(), T); 4026 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { 4027 // Use shufps or movss. 4028 Variable *ElementR = nullptr; 4029 Operand *SourceVectRM = 4030 legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem); 4031 4032 if (InVectorElementTy == IceType_f32) { 4033 // ElementR will be in an XMM register since it is floating point. 4034 ElementR = legalizeToReg(ElementToInsertNotLegalized); 4035 } else { 4036 // Copy an integer to an XMM register. 4037 Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem); 4038 ElementR = makeReg(Ty); 4039 _movd(ElementR, T); 4040 } 4041 4042 if (Index == 0) { 4043 Variable *T = makeReg(Ty); 4044 _movp(T, SourceVectRM); 4045 _movss(T, ElementR); 4046 _movp(Instr->getDest(), T); 4047 return; 4048 } 4049 4050 // shufps treats the source and destination operands as vectors of four 4051 // doublewords. The destination's two high doublewords are selected from 4052 // the source operand and the two low doublewords are selected from the 4053 // (original value of) the destination operand. An insertelement operation 4054 // can be effected with a sequence of two shufps operations with 4055 // appropriate masks. In all cases below, Element[0] is being inserted into 4056 // SourceVectOperand. Indices are ordered from left to right. 
4057 // 4058 // insertelement into index 1 (result is stored in ElementR): 4059 // ElementR := ElementR[0, 0] SourceVectRM[0, 0] 4060 // ElementR := ElementR[3, 0] SourceVectRM[2, 3] 4061 // 4062 // insertelement into index 2 (result is stored in T): 4063 // T := SourceVectRM 4064 // ElementR := ElementR[0, 0] T[0, 3] 4065 // T := T[0, 1] ElementR[0, 3] 4066 // 4067 // insertelement into index 3 (result is stored in T): 4068 // T := SourceVectRM 4069 // ElementR := ElementR[0, 0] T[0, 2] 4070 // T := T[0, 1] ElementR[3, 0] 4071 const unsigned char Mask1[3] = {0, 192, 128}; 4072 const unsigned char Mask2[3] = {227, 196, 52}; 4073 4074 Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]); 4075 Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]); 4076 4077 if (Index == 1) { 4078 _shufps(ElementR, SourceVectRM, Mask1Constant); 4079 _shufps(ElementR, SourceVectRM, Mask2Constant); 4080 _movp(Instr->getDest(), ElementR); 4081 } else { 4082 Variable *T = makeReg(Ty); 4083 _movp(T, SourceVectRM); 4084 _shufps(ElementR, T, Mask1Constant); 4085 _shufps(T, ElementR, Mask2Constant); 4086 _movp(Instr->getDest(), T); 4087 } 4088 } else { 4089 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); 4090 // Spill the value to a stack slot and perform the insertion in memory. 4091 // 4092 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support 4093 // for legalizing to mem is implemented. 4094 Variable *Slot = Func->makeVariable(Ty); 4095 Slot->setMustNotHaveReg(); 4096 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); 4097 4098 // Compute the location of the position to insert in memory. 4099 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); 4100 X86OperandMem *Loc = 4101 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); 4102 _store(legalizeToReg(ElementToInsertNotLegalized), Loc); 4103 4104 Variable *T = makeReg(Ty); 4105 _movp(T, Slot); 4106 _movp(Instr->getDest(), T); 4107 } 4108 } 4109 4110 template <typename TraitsType> 4111 void TargetX86Base<TraitsType>::lowerIntrinsicCall( 4112 const InstIntrinsicCall *Instr) { 4113 switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) { 4114 case Intrinsics::AtomicCmpxchg: { 4115 if (!Intrinsics::isMemoryOrderValid( 4116 ID, getConstantMemoryOrder(Instr->getArg(3)), 4117 getConstantMemoryOrder(Instr->getArg(4)))) { 4118 Func->setError("Unexpected memory ordering for AtomicCmpxchg"); 4119 return; 4120 } 4121 Variable *DestPrev = Instr->getDest(); 4122 Operand *PtrToMem = legalize(Instr->getArg(0)); 4123 Operand *Expected = legalize(Instr->getArg(1)); 4124 Operand *Desired = legalize(Instr->getArg(2)); 4125 if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired)) 4126 return; 4127 lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired); 4128 return; 4129 } 4130 case Intrinsics::AtomicFence: 4131 if (!Intrinsics::isMemoryOrderValid( 4132 ID, getConstantMemoryOrder(Instr->getArg(0)))) { 4133 Func->setError("Unexpected memory ordering for AtomicFence"); 4134 return; 4135 } 4136 _mfence(); 4137 return; 4138 case Intrinsics::AtomicFenceAll: 4139 // NOTE: FenceAll should prevent and load/store from being moved across the 4140 // fence (both atomic and non-atomic). The InstX8632Mfence instruction is 4141 // currently marked coarsely as "HasSideEffects". 4142 _mfence(); 4143 return; 4144 case Intrinsics::AtomicIsLockFree: { 4145 // X86 is always lock free for 8/16/32/64 bit accesses. 
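// For example, a constant ByteSize of 1, 2, 4, or 8 makes Dest the constant
// 1 below, while any other constant size (e.g. 16) yields 0, since the
// cmpxchg16b path is not supported here.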
4146 // TODO(jvoung): Since the result is constant when given a constant byte 4147 // size, this opens up DCE opportunities. 4148 Operand *ByteSize = Instr->getArg(0); 4149 Variable *Dest = Instr->getDest(); 4150 if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) { 4151 Constant *Result; 4152 switch (CI->getValue()) { 4153 default: 4154 // Some x86-64 processors support the cmpxchg16b instruction, which can 4155 // make 16-byte operations lock free (when used with the LOCK prefix). 4156 // However, that's not supported in 32-bit mode, so just return 0 even 4157 // for large sizes. 4158 Result = Ctx->getConstantZero(IceType_i32); 4159 break; 4160 case 1: 4161 case 2: 4162 case 4: 4163 case 8: 4164 Result = Ctx->getConstantInt32(1); 4165 break; 4166 } 4167 _mov(Dest, Result); 4168 return; 4169 } 4170 // The PNaCl ABI requires the byte size to be a compile-time constant. 4171 Func->setError("AtomicIsLockFree byte size should be compile-time const"); 4172 return; 4173 } 4174 case Intrinsics::AtomicLoad: { 4175 // We require the memory address to be naturally aligned. Given that is the 4176 // case, then normal loads are atomic. 4177 if (!Intrinsics::isMemoryOrderValid( 4178 ID, getConstantMemoryOrder(Instr->getArg(1)))) { 4179 Func->setError("Unexpected memory ordering for AtomicLoad"); 4180 return; 4181 } 4182 Variable *Dest = Instr->getDest(); 4183 if (!Traits::Is64Bit) { 4184 if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) { 4185 // Follow what GCC does and use a movq instead of what lowerLoad() 4186 // normally does (split the load into two). Thus, this skips 4187 // load/arithmetic op folding. Load/arithmetic folding can't happen 4188 // anyway, since this is x86-32 and integer arithmetic only happens on 4189 // 32-bit quantities. 4190 Variable *T = makeReg(IceType_f64); 4191 X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64); 4192 _movq(T, Addr); 4193 // Then cast the bits back out of the XMM register to the i64 Dest. 4194 auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T); 4195 lowerCast(Cast); 4196 // Make sure that the atomic load isn't elided when unused. 4197 Context.insert<InstFakeUse>(Dest64On32->getLo()); 4198 Context.insert<InstFakeUse>(Dest64On32->getHi()); 4199 return; 4200 } 4201 } 4202 auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0)); 4203 lowerLoad(Load); 4204 // Make sure the atomic load isn't elided when unused, by adding a FakeUse. 4205 // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert 4206 // the FakeUse on the last-inserted instruction's dest. 4207 Context.insert<InstFakeUse>(Context.getLastInserted()->getDest()); 4208 return; 4209 } 4210 case Intrinsics::AtomicRMW: 4211 if (!Intrinsics::isMemoryOrderValid( 4212 ID, getConstantMemoryOrder(Instr->getArg(3)))) { 4213 Func->setError("Unexpected memory ordering for AtomicRMW"); 4214 return; 4215 } 4216 lowerAtomicRMW( 4217 Instr->getDest(), 4218 static_cast<uint32_t>( 4219 llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()), 4220 Instr->getArg(1), Instr->getArg(2)); 4221 return; 4222 case Intrinsics::AtomicStore: { 4223 if (!Intrinsics::isMemoryOrderValid( 4224 ID, getConstantMemoryOrder(Instr->getArg(2)))) { 4225 Func->setError("Unexpected memory ordering for AtomicStore"); 4226 return; 4227 } 4228 // We require the memory address to be naturally aligned. Given that is the 4229 // case, then normal stores are atomic. Add a fence after the store to make 4230 // it visible. 
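// Illustrative result for a 32-bit value (operands symbolic): the lowering
// below reduces to roughly
//   mov [addr], value
//   mfence
// which is the usual seq_cst store pattern on x86; the i64-on-x86-32 case
// instead goes through an SSE movq so the 64-bit store remains a single
// access (following the GCC approach noted below).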
4231 Operand *Value = Instr->getArg(0); 4232 Operand *Ptr = Instr->getArg(1); 4233 if (!Traits::Is64Bit && Value->getType() == IceType_i64) { 4234 // Use a movq instead of what lowerStore() normally does (split the store 4235 // into two), following what GCC does. Cast the bits from int -> to an 4236 // xmm register first. 4237 Variable *T = makeReg(IceType_f64); 4238 auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value); 4239 lowerCast(Cast); 4240 // Then store XMM w/ a movq. 4241 X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64); 4242 _storeq(T, Addr); 4243 _mfence(); 4244 return; 4245 } 4246 auto *Store = InstStore::create(Func, Value, Ptr); 4247 lowerStore(Store); 4248 _mfence(); 4249 return; 4250 } 4251 case Intrinsics::Bswap: { 4252 Variable *Dest = Instr->getDest(); 4253 Operand *Val = Instr->getArg(0); 4254 // In 32-bit mode, bswap only works on 32-bit arguments, and the argument 4255 // must be a register. Use rotate left for 16-bit bswap. 4256 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { 4257 Val = legalizeUndef(Val); 4258 Variable *T_Lo = legalizeToReg(loOperand(Val)); 4259 Variable *T_Hi = legalizeToReg(hiOperand(Val)); 4260 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 4261 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 4262 _bswap(T_Lo); 4263 _bswap(T_Hi); 4264 _mov(DestLo, T_Hi); 4265 _mov(DestHi, T_Lo); 4266 } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) || 4267 Val->getType() == IceType_i32) { 4268 Variable *T = legalizeToReg(Val); 4269 _bswap(T); 4270 _mov(Dest, T); 4271 } else { 4272 assert(Val->getType() == IceType_i16); 4273 Constant *Eight = Ctx->getConstantInt16(8); 4274 Variable *T = nullptr; 4275 Val = legalize(Val); 4276 _mov(T, Val); 4277 _rol(T, Eight); 4278 _mov(Dest, T); 4279 } 4280 return; 4281 } 4282 case Intrinsics::Ctpop: { 4283 Variable *Dest = Instr->getDest(); 4284 Variable *T = nullptr; 4285 Operand *Val = Instr->getArg(0); 4286 Type ValTy = Val->getType(); 4287 assert(ValTy == IceType_i32 || ValTy == IceType_i64); 4288 4289 if (!Traits::Is64Bit) { 4290 T = Dest; 4291 } else { 4292 T = makeReg(IceType_i64); 4293 if (ValTy == IceType_i32) { 4294 // in x86-64, __popcountsi2 is not defined, so we cheat a bit by 4295 // converting it to a 64-bit value, and using ctpop_i64. _movzx should 4296 // ensure we will not have any bits set on Val's upper 32 bits. 4297 Variable *V = makeReg(IceType_i64); 4298 Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem); 4299 _movzx(V, ValRM); 4300 Val = V; 4301 } 4302 ValTy = IceType_i64; 4303 } 4304 4305 InstCall *Call = 4306 makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32 4307 : RuntimeHelper::H_call_ctpop_i64, 4308 T, 1); 4309 Call->addArg(Val); 4310 lowerCall(Call); 4311 // The popcount helpers always return 32-bit values, while the intrinsic's 4312 // signature matches the native POPCNT instruction and fills a 64-bit reg 4313 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case 4314 // the user doesn't do that in the IR. If the user does that in the IR, 4315 // then this zero'ing instruction is dead and gets optimized out. 4316 if (!Traits::Is64Bit) { 4317 assert(T == Dest); 4318 if (Val->getType() == IceType_i64) { 4319 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 4320 Constant *Zero = Ctx->getConstantZero(IceType_i32); 4321 _mov(DestHi, Zero); 4322 } 4323 } else { 4324 assert(Val->getType() == IceType_i64); 4325 // T is 64 bit. It needs to be copied to dest. 
We need to: 4326 // 4327 // T_1.32 = trunc T.64 to i32 4328 // T_2.64 = zext T_1.32 to i64 4329 // Dest.<<right_size>> = T_2.<<right_size>> 4330 // 4331 // which ensures the upper 32 bits will always be cleared. Just doing a 4332 // 4333 // mov Dest.32 = trunc T.32 to i32 4334 // 4335 // is dangerous because there's a chance the compiler will optimize this 4336 // copy out. To use _movzx we need two new registers (one 32-, and 4337 // another 64-bit wide.) 4338 Variable *T_1 = makeReg(IceType_i32); 4339 _mov(T_1, T); 4340 Variable *T_2 = makeReg(IceType_i64); 4341 _movzx(T_2, T_1); 4342 _mov(Dest, T_2); 4343 } 4344 return; 4345 } 4346 case Intrinsics::Ctlz: { 4347 // The "is zero undef" parameter is ignored and we always return a 4348 // well-defined value. 4349 Operand *Val = legalize(Instr->getArg(0)); 4350 Operand *FirstVal; 4351 Operand *SecondVal = nullptr; 4352 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { 4353 FirstVal = loOperand(Val); 4354 SecondVal = hiOperand(Val); 4355 } else { 4356 FirstVal = Val; 4357 } 4358 constexpr bool IsCttz = false; 4359 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, 4360 SecondVal); 4361 return; 4362 } 4363 case Intrinsics::Cttz: { 4364 // The "is zero undef" parameter is ignored and we always return a 4365 // well-defined value. 4366 Operand *Val = legalize(Instr->getArg(0)); 4367 Operand *FirstVal; 4368 Operand *SecondVal = nullptr; 4369 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { 4370 FirstVal = hiOperand(Val); 4371 SecondVal = loOperand(Val); 4372 } else { 4373 FirstVal = Val; 4374 } 4375 constexpr bool IsCttz = true; 4376 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, 4377 SecondVal); 4378 return; 4379 } 4380 case Intrinsics::Fabs: { 4381 Operand *Src = legalize(Instr->getArg(0)); 4382 Type Ty = Src->getType(); 4383 Variable *Dest = Instr->getDest(); 4384 Variable *T = makeVectorOfFabsMask(Ty); 4385 // The pand instruction operates on an m128 memory operand, so if Src is an 4386 // f32 or f64, we need to make sure it's in a register. 
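// Illustrative sketch (mask value shown for f32 lanes; names symbolic):
// T holds a mask that keeps every bit except the sign bit, e.g. 0x7FFFFFFF
// per 32-bit lane, so
//   pand T, Src
// clears the sign bit(s) of Src, which is fabs() by bit manipulation.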
4387 if (isVectorType(Ty)) { 4388 if (llvm::isa<X86OperandMem>(Src)) 4389 Src = legalizeToReg(Src); 4390 } else { 4391 Src = legalizeToReg(Src); 4392 } 4393 _pand(T, Src); 4394 if (isVectorType(Ty)) 4395 _movp(Dest, T); 4396 else 4397 _mov(Dest, T); 4398 return; 4399 } 4400 case Intrinsics::Longjmp: { 4401 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2); 4402 Call->addArg(Instr->getArg(0)); 4403 Call->addArg(Instr->getArg(1)); 4404 lowerCall(Call); 4405 return; 4406 } 4407 case Intrinsics::Memcpy: { 4408 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 4409 return; 4410 } 4411 case Intrinsics::Memmove: { 4412 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 4413 return; 4414 } 4415 case Intrinsics::Memset: { 4416 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 4417 return; 4418 } 4419 case Intrinsics::NaClReadTP: { 4420 if (NeedSandboxing) { 4421 Operand *Src = 4422 dispatchToConcrete(&ConcreteTarget::createNaClReadTPSrcOperand); 4423 Variable *Dest = Instr->getDest(); 4424 Variable *T = nullptr; 4425 _mov(T, Src); 4426 _mov(Dest, T); 4427 } else { 4428 InstCall *Call = 4429 makeHelperCall(RuntimeHelper::H_call_read_tp, Instr->getDest(), 0); 4430 lowerCall(Call); 4431 } 4432 return; 4433 } 4434 case Intrinsics::Setjmp: { 4435 InstCall *Call = 4436 makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1); 4437 Call->addArg(Instr->getArg(0)); 4438 lowerCall(Call); 4439 return; 4440 } 4441 case Intrinsics::Sqrt: { 4442 assert(isScalarFloatingType(Instr->getDest()->getType()) || 4443 getFlags().getApplicationBinaryInterface() != ::Ice::ABI_PNaCl); 4444 Operand *Src = legalize(Instr->getArg(0)); 4445 Variable *Dest = Instr->getDest(); 4446 Variable *T = makeReg(Dest->getType()); 4447 _sqrt(T, Src); 4448 if (isVectorType(Dest->getType())) { 4449 _movp(Dest, T); 4450 } else { 4451 _mov(Dest, T); 4452 } 4453 return; 4454 } 4455 case Intrinsics::Stacksave: { 4456 if (!Traits::Is64Bit || !NeedSandboxing) { 4457 Variable *esp = Func->getTarget()->getPhysicalRegister(getStackReg(), 4458 Traits::WordType); 4459 Variable *Dest = Instr->getDest(); 4460 _mov(Dest, esp); 4461 return; 4462 } 4463 Variable *esp = Func->getTarget()->getPhysicalRegister( 4464 Traits::RegisterSet::Reg_esp, IceType_i32); 4465 Variable *Dest = Instr->getDest(); 4466 _mov(Dest, esp); 4467 4468 return; 4469 } 4470 case Intrinsics::Stackrestore: { 4471 Operand *Src = Instr->getArg(0); 4472 _mov_sp(Src); 4473 return; 4474 } 4475 4476 case Intrinsics::Trap: 4477 _ud2(); 4478 return; 4479 case Intrinsics::LoadSubVector: { 4480 assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) && 4481 "LoadSubVector second argument must be a constant"); 4482 Variable *Dest = Instr->getDest(); 4483 Type Ty = Dest->getType(); 4484 auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1)); 4485 Operand *Addr = Instr->getArg(0); 4486 X86OperandMem *Src = formMemoryOperand(Addr, Ty); 4487 doMockBoundsCheck(Src); 4488 4489 if (Dest->isRematerializable()) { 4490 Context.insert<InstFakeDef>(Dest); 4491 return; 4492 } 4493 4494 auto *T = makeReg(Ty); 4495 switch (SubVectorSize->getValue()) { 4496 case 4: 4497 _movd(T, Src); 4498 break; 4499 case 8: 4500 _movq(T, Src); 4501 break; 4502 default: 4503 Func->setError("Unexpected size for LoadSubVector"); 4504 return; 4505 } 4506 _movp(Dest, T); 4507 return; 4508 } 4509 case Intrinsics::StoreSubVector: { 4510 assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) && 4511 "StoreSubVector third argument must be a 
constant"); 4512 auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2)); 4513 Operand *Value = Instr->getArg(0); 4514 Operand *Addr = Instr->getArg(1); 4515 X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType()); 4516 doMockBoundsCheck(NewAddr); 4517 4518 Value = legalizeToReg(Value); 4519 4520 switch (SubVectorSize->getValue()) { 4521 case 4: 4522 _stored(Value, NewAddr); 4523 break; 4524 case 8: 4525 _storeq(Value, NewAddr); 4526 break; 4527 default: 4528 Func->setError("Unexpected size for StoreSubVector"); 4529 return; 4530 } 4531 return; 4532 } 4533 case Intrinsics::VectorPackSigned: { 4534 Operand *Src0 = Instr->getArg(0); 4535 Operand *Src1 = Instr->getArg(1); 4536 Variable *Dest = Instr->getDest(); 4537 auto *T = makeReg(Src0->getType()); 4538 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4539 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4540 _movp(T, Src0RM); 4541 _packss(T, Src1RM); 4542 _movp(Dest, T); 4543 return; 4544 } 4545 case Intrinsics::VectorPackUnsigned: { 4546 Operand *Src0 = Instr->getArg(0); 4547 Operand *Src1 = Instr->getArg(1); 4548 Variable *Dest = Instr->getDest(); 4549 auto *T = makeReg(Src0->getType()); 4550 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4551 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4552 _movp(T, Src0RM); 4553 _packus(T, Src1RM); 4554 _movp(Dest, T); 4555 return; 4556 } 4557 case Intrinsics::SignMask: { 4558 Operand *SrcReg = legalizeToReg(Instr->getArg(0)); 4559 Variable *Dest = Instr->getDest(); 4560 Variable *T = makeReg(IceType_i32); 4561 if (SrcReg->getType() == IceType_v4f32 || 4562 SrcReg->getType() == IceType_v4i32 || 4563 SrcReg->getType() == IceType_v16i8) { 4564 _movmsk(T, SrcReg); 4565 } else { 4566 // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb 4567 llvm::report_fatal_error("Invalid type for SignMask intrinsic"); 4568 } 4569 _mov(Dest, T); 4570 return; 4571 } 4572 case Intrinsics::MultiplyHighSigned: { 4573 Operand *Src0 = Instr->getArg(0); 4574 Operand *Src1 = Instr->getArg(1); 4575 Variable *Dest = Instr->getDest(); 4576 auto *T = makeReg(Dest->getType()); 4577 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4578 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4579 _movp(T, Src0RM); 4580 _pmulhw(T, Src1RM); 4581 _movp(Dest, T); 4582 return; 4583 } 4584 case Intrinsics::MultiplyHighUnsigned: { 4585 Operand *Src0 = Instr->getArg(0); 4586 Operand *Src1 = Instr->getArg(1); 4587 Variable *Dest = Instr->getDest(); 4588 auto *T = makeReg(Dest->getType()); 4589 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4590 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4591 _movp(T, Src0RM); 4592 _pmulhuw(T, Src1RM); 4593 _movp(Dest, T); 4594 return; 4595 } 4596 case Intrinsics::MultiplyAddPairs: { 4597 Operand *Src0 = Instr->getArg(0); 4598 Operand *Src1 = Instr->getArg(1); 4599 Variable *Dest = Instr->getDest(); 4600 auto *T = makeReg(Dest->getType()); 4601 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4602 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4603 _movp(T, Src0RM); 4604 _pmaddwd(T, Src1RM); 4605 _movp(Dest, T); 4606 return; 4607 } 4608 case Intrinsics::AddSaturateSigned: { 4609 Operand *Src0 = Instr->getArg(0); 4610 Operand *Src1 = Instr->getArg(1); 4611 Variable *Dest = Instr->getDest(); 4612 auto *T = makeReg(Dest->getType()); 4613 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4614 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4615 _movp(T, Src0RM); 4616 _padds(T, Src1RM); 4617 
_movp(Dest, T); 4618 return; 4619 } 4620 case Intrinsics::SubtractSaturateSigned: { 4621 Operand *Src0 = Instr->getArg(0); 4622 Operand *Src1 = Instr->getArg(1); 4623 Variable *Dest = Instr->getDest(); 4624 auto *T = makeReg(Dest->getType()); 4625 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4626 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4627 _movp(T, Src0RM); 4628 _psubs(T, Src1RM); 4629 _movp(Dest, T); 4630 return; 4631 } 4632 case Intrinsics::AddSaturateUnsigned: { 4633 Operand *Src0 = Instr->getArg(0); 4634 Operand *Src1 = Instr->getArg(1); 4635 Variable *Dest = Instr->getDest(); 4636 auto *T = makeReg(Dest->getType()); 4637 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4638 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4639 _movp(T, Src0RM); 4640 _paddus(T, Src1RM); 4641 _movp(Dest, T); 4642 return; 4643 } 4644 case Intrinsics::SubtractSaturateUnsigned: { 4645 Operand *Src0 = Instr->getArg(0); 4646 Operand *Src1 = Instr->getArg(1); 4647 Variable *Dest = Instr->getDest(); 4648 auto *T = makeReg(Dest->getType()); 4649 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 4650 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 4651 _movp(T, Src0RM); 4652 _psubus(T, Src1RM); 4653 _movp(Dest, T); 4654 return; 4655 } 4656 case Intrinsics::Nearbyint: { 4657 Operand *Src = Instr->getArg(0); 4658 Variable *Dest = Instr->getDest(); 4659 Type DestTy = Dest->getType(); 4660 if (isVectorType(DestTy)) { 4661 assert(DestTy == IceType_v4i32); 4662 assert(Src->getType() == IceType_v4f32); 4663 Operand *Src0R = legalizeToReg(Src); 4664 Variable *T = makeReg(DestTy); 4665 _cvt(T, Src0R, Traits::Insts::Cvt::Ps2dq); 4666 _movp(Dest, T); 4667 } else if (!Traits::Is64Bit && DestTy == IceType_i64) { 4668 llvm::report_fatal_error("Helper call was expected"); 4669 } else { 4670 Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem); 4671 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type 4672 Variable *T_1 = nullptr; 4673 if (Traits::Is64Bit && DestTy == IceType_i64) { 4674 T_1 = makeReg(IceType_i64); 4675 } else { 4676 assert(DestTy != IceType_i64); 4677 T_1 = makeReg(IceType_i32); 4678 } 4679 // cvt() requires its integer argument to be a GPR. 
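// Informal note: cvtss2si / cvtps2dq round according to the current MXCSR
// rounding mode (round-to-nearest-even by default), which is what gives
// this lowering nearbyint semantics; the truncating fptosi/fptoui paths
// above use the cvttss2si (Tss2si) form instead.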
4680 Variable *T_2 = makeReg(DestTy); 4681 if (isByteSizedType(DestTy)) { 4682 assert(T_1->getType() == IceType_i32); 4683 T_1->setRegClass(RCX86_Is32To8); 4684 T_2->setRegClass(RCX86_IsTrunc8Rcvr); 4685 } 4686 _cvt(T_1, Src0RM, Traits::Insts::Cvt::Ss2si); 4687 _mov(T_2, T_1); // T_1 and T_2 may have different integer types 4688 if (DestTy == IceType_i1) 4689 _and(T_2, Ctx->getConstantInt1(1)); 4690 _mov(Dest, T_2); 4691 } 4692 return; 4693 } 4694 case Intrinsics::Round: { 4695 assert(InstructionSet >= Traits::SSE4_1); 4696 Variable *Dest = Instr->getDest(); 4697 Operand *Src = Instr->getArg(0); 4698 Operand *Mode = Instr->getArg(1); 4699 assert(llvm::isa<ConstantInteger32>(Mode) && 4700 "Round last argument must be a constant"); 4701 auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem); 4702 int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue(); 4703 (void)Imm; 4704 assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode"); 4705 auto *T = makeReg(Dest->getType()); 4706 _round(T, SrcRM, Mode); 4707 _movp(Dest, T); 4708 return; 4709 } 4710 default: // UnknownIntrinsic 4711 Func->setError("Unexpected intrinsic"); 4712 return; 4713 } 4714 return; 4715 } 4716 4717 template <typename TraitsType> 4718 void TargetX86Base<TraitsType>::lowerAtomicCmpxchg(Variable *DestPrev, 4719 Operand *Ptr, 4720 Operand *Expected, 4721 Operand *Desired) { 4722 Type Ty = Expected->getType(); 4723 if (!Traits::Is64Bit && Ty == IceType_i64) { 4724 // Reserve the pre-colored registers first, before adding any more 4725 // infinite-weight variables from formMemoryOperand's legalization. 4726 Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx); 4727 Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax); 4728 Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx); 4729 Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx); 4730 _mov(T_eax, loOperand(Expected)); 4731 _mov(T_edx, hiOperand(Expected)); 4732 _mov(T_ebx, loOperand(Desired)); 4733 _mov(T_ecx, hiOperand(Desired)); 4734 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty); 4735 constexpr bool Locked = true; 4736 _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked); 4737 auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev)); 4738 auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev)); 4739 _mov(DestLo, T_eax); 4740 _mov(DestHi, T_edx); 4741 return; 4742 } 4743 RegNumT Eax; 4744 switch (Ty) { 4745 default: 4746 llvm::report_fatal_error("Bad type for cmpxchg"); 4747 case IceType_i64: 4748 Eax = Traits::getRaxOrDie(); 4749 break; 4750 case IceType_i32: 4751 Eax = Traits::RegisterSet::Reg_eax; 4752 break; 4753 case IceType_i16: 4754 Eax = Traits::RegisterSet::Reg_ax; 4755 break; 4756 case IceType_i8: 4757 Eax = Traits::RegisterSet::Reg_al; 4758 break; 4759 } 4760 Variable *T_eax = makeReg(Ty, Eax); 4761 _mov(T_eax, Expected); 4762 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty); 4763 Variable *DesiredReg = legalizeToReg(Desired); 4764 constexpr bool Locked = true; 4765 _cmpxchg(Addr, T_eax, DesiredReg, Locked); 4766 _mov(DestPrev, T_eax); 4767 } 4768 4769 template <typename TraitsType> 4770 bool TargetX86Base<TraitsType>::tryOptimizedCmpxchgCmpBr(Variable *Dest, 4771 Operand *PtrToMem, 4772 Operand *Expected, 4773 Operand *Desired) { 4774 if (Func->getOptLevel() == Opt_m1) 4775 return false; 4776 // Peek ahead a few instructions and see how Dest is used. 4777 // It's very common to have: 4778 // 4779 // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...) 4780 // [%y_phi = ...] 
// list of phi stores 4781 // %p = icmp eq i32 %x, %expected 4782 // br i1 %p, label %l1, label %l2 4783 // 4784 // which we can optimize into: 4785 // 4786 // %x = <cmpxchg code> 4787 // [%y_phi = ...] // list of phi stores 4788 // br eq, %l1, %l2 4789 InstList::iterator I = Context.getCur(); 4790 // I is currently the InstIntrinsicCall. Peek past that. 4791 // This assumes that the atomic cmpxchg has not been lowered yet, 4792 // so that the instructions seen in the scan from "Cur" is simple. 4793 assert(llvm::isa<InstIntrinsicCall>(*I)); 4794 Inst *NextInst = Context.getNextInst(I); 4795 if (!NextInst) 4796 return false; 4797 // There might be phi assignments right before the compare+branch, since this 4798 // could be a backward branch for a loop. This placement of assignments is 4799 // determined by placePhiStores(). 4800 CfgVector<InstAssign *> PhiAssigns; 4801 while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) { 4802 if (PhiAssign->getDest() == Dest) 4803 return false; 4804 PhiAssigns.push_back(PhiAssign); 4805 NextInst = Context.getNextInst(I); 4806 if (!NextInst) 4807 return false; 4808 } 4809 if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) { 4810 if (!(NextCmp->getCondition() == InstIcmp::Eq && 4811 ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) || 4812 (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) { 4813 return false; 4814 } 4815 NextInst = Context.getNextInst(I); 4816 if (!NextInst) 4817 return false; 4818 if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) { 4819 if (!NextBr->isUnconditional() && 4820 NextCmp->getDest() == NextBr->getCondition() && 4821 NextBr->isLastUse(NextCmp->getDest())) { 4822 lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired); 4823 for (size_t i = 0; i < PhiAssigns.size(); ++i) { 4824 // Lower the phi assignments now, before the branch (same placement 4825 // as before). 4826 InstAssign *PhiAssign = PhiAssigns[i]; 4827 PhiAssign->setDeleted(); 4828 lowerAssign(PhiAssign); 4829 Context.advanceNext(); 4830 } 4831 _br(Traits::Cond::Br_e, NextBr->getTargetTrue(), 4832 NextBr->getTargetFalse()); 4833 // Skip over the old compare and branch, by deleting them. 4834 NextCmp->setDeleted(); 4835 NextBr->setDeleted(); 4836 Context.advanceNext(); 4837 Context.advanceNext(); 4838 return true; 4839 } 4840 } 4841 } 4842 return false; 4843 } 4844 4845 template <typename TraitsType> 4846 void TargetX86Base<TraitsType>::lowerAtomicRMW(Variable *Dest, 4847 uint32_t Operation, Operand *Ptr, 4848 Operand *Val) { 4849 bool NeedsCmpxchg = false; 4850 LowerBinOp Op_Lo = nullptr; 4851 LowerBinOp Op_Hi = nullptr; 4852 switch (Operation) { 4853 default: 4854 Func->setError("Unknown AtomicRMW operation"); 4855 return; 4856 case Intrinsics::AtomicAdd: { 4857 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { 4858 // All the fall-through paths must set this to true, but use this 4859 // for asserting. 
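// (Note: the 32-bit AtomicAdd fast path below avoids a loop entirely with a
// single "lock xadd [addr], T", which adds and hands back the old value in
// T; only the i64-on-x86-32 form needs the cmpxchg8b loop.)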
4860 NeedsCmpxchg = true; 4861 Op_Lo = &TargetX86Base<TraitsType>::_add; 4862 Op_Hi = &TargetX86Base<TraitsType>::_adc; 4863 break; 4864 } 4865 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType()); 4866 constexpr bool Locked = true; 4867 Variable *T = nullptr; 4868 _mov(T, Val); 4869 _xadd(Addr, T, Locked); 4870 _mov(Dest, T); 4871 return; 4872 } 4873 case Intrinsics::AtomicSub: { 4874 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { 4875 NeedsCmpxchg = true; 4876 Op_Lo = &TargetX86Base<TraitsType>::_sub; 4877 Op_Hi = &TargetX86Base<TraitsType>::_sbb; 4878 break; 4879 } 4880 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType()); 4881 constexpr bool Locked = true; 4882 Variable *T = nullptr; 4883 _mov(T, Val); 4884 _neg(T); 4885 _xadd(Addr, T, Locked); 4886 _mov(Dest, T); 4887 return; 4888 } 4889 case Intrinsics::AtomicOr: 4890 // TODO(jvoung): If Dest is null or dead, then some of these 4891 // operations do not need an "exchange", but just a locked op. 4892 // That appears to be "worth" it for sub, or, and, and xor. 4893 // xadd is probably fine vs lock add for add, and xchg is fine 4894 // vs an atomic store. 4895 NeedsCmpxchg = true; 4896 Op_Lo = &TargetX86Base<TraitsType>::_or; 4897 Op_Hi = &TargetX86Base<TraitsType>::_or; 4898 break; 4899 case Intrinsics::AtomicAnd: 4900 NeedsCmpxchg = true; 4901 Op_Lo = &TargetX86Base<TraitsType>::_and; 4902 Op_Hi = &TargetX86Base<TraitsType>::_and; 4903 break; 4904 case Intrinsics::AtomicXor: 4905 NeedsCmpxchg = true; 4906 Op_Lo = &TargetX86Base<TraitsType>::_xor; 4907 Op_Hi = &TargetX86Base<TraitsType>::_xor; 4908 break; 4909 case Intrinsics::AtomicExchange: 4910 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { 4911 NeedsCmpxchg = true; 4912 // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values 4913 // just need to be moved to the ecx and ebx registers. 4914 Op_Lo = nullptr; 4915 Op_Hi = nullptr; 4916 break; 4917 } 4918 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType()); 4919 Variable *T = nullptr; 4920 _mov(T, Val); 4921 _xchg(Addr, T); 4922 _mov(Dest, T); 4923 return; 4924 } 4925 // Otherwise, we need a cmpxchg loop. 4926 (void)NeedsCmpxchg; 4927 assert(NeedsCmpxchg); 4928 expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val); 4929 } 4930 4931 template <typename TraitsType> 4932 void TargetX86Base<TraitsType>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, 4933 LowerBinOp Op_Hi, 4934 Variable *Dest, 4935 Operand *Ptr, 4936 Operand *Val) { 4937 // Expand a more complex RMW operation as a cmpxchg loop: 4938 // For 64-bit: 4939 // mov eax, [ptr] 4940 // mov edx, [ptr + 4] 4941 // .LABEL: 4942 // mov ebx, eax 4943 // <Op_Lo> ebx, <desired_adj_lo> 4944 // mov ecx, edx 4945 // <Op_Hi> ecx, <desired_adj_hi> 4946 // lock cmpxchg8b [ptr] 4947 // jne .LABEL 4948 // mov <dest_lo>, eax 4949 // mov <dest_lo>, edx 4950 // 4951 // For 32-bit: 4952 // mov eax, [ptr] 4953 // .LABEL: 4954 // mov <reg>, eax 4955 // op <reg>, [desired_adj] 4956 // lock cmpxchg [ptr], <reg> 4957 // jne .LABEL 4958 // mov <dest>, eax 4959 // 4960 // If Op_{Lo,Hi} are nullptr, then just copy the value. 
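// Concrete instance (sketch) for a 32-bit AtomicAnd, with symbolic operands;
// the generic template above specializes to roughly:
//   mov eax, [ptr]
// .LABEL:
//   mov <reg>, eax
//   and <reg>, <val>
//   lock cmpxchg [ptr], <reg>
//   jne .LABEL
//   mov <dest>, eax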
4961 Val = legalize(Val); 4962 Type Ty = Val->getType(); 4963 if (!Traits::Is64Bit && Ty == IceType_i64) { 4964 Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx); 4965 Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax); 4966 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty); 4967 _mov(T_eax, loOperand(Addr)); 4968 _mov(T_edx, hiOperand(Addr)); 4969 Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx); 4970 Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx); 4971 InstX86Label *Label = InstX86Label::create(Func, this); 4972 const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr; 4973 if (!IsXchg8b) { 4974 Context.insert(Label); 4975 _mov(T_ebx, T_eax); 4976 (this->*Op_Lo)(T_ebx, loOperand(Val)); 4977 _mov(T_ecx, T_edx); 4978 (this->*Op_Hi)(T_ecx, hiOperand(Val)); 4979 } else { 4980 // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi. 4981 // It just needs the Val loaded into ebx and ecx. 4982 // That can also be done before the loop. 4983 _mov(T_ebx, loOperand(Val)); 4984 _mov(T_ecx, hiOperand(Val)); 4985 Context.insert(Label); 4986 } 4987 constexpr bool Locked = true; 4988 _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked); 4989 _br(Traits::Cond::Br_ne, Label); 4990 if (!IsXchg8b) { 4991 // If Val is a variable, model the extended live range of Val through 4992 // the end of the loop, since it will be re-used by the loop. 4993 if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) { 4994 auto *ValLo = llvm::cast<Variable>(loOperand(ValVar)); 4995 auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar)); 4996 Context.insert<InstFakeUse>(ValLo); 4997 Context.insert<InstFakeUse>(ValHi); 4998 } 4999 } else { 5000 // For xchg, the loop is slightly smaller and ebx/ecx are used. 5001 Context.insert<InstFakeUse>(T_ebx); 5002 Context.insert<InstFakeUse>(T_ecx); 5003 } 5004 // The address base (if any) is also reused in the loop. 5005 if (Variable *Base = Addr->getBase()) 5006 Context.insert<InstFakeUse>(Base); 5007 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 5008 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 5009 _mov(DestLo, T_eax); 5010 _mov(DestHi, T_edx); 5011 return; 5012 } 5013 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty); 5014 RegNumT Eax; 5015 switch (Ty) { 5016 default: 5017 llvm::report_fatal_error("Bad type for atomicRMW"); 5018 case IceType_i64: 5019 Eax = Traits::getRaxOrDie(); 5020 break; 5021 case IceType_i32: 5022 Eax = Traits::RegisterSet::Reg_eax; 5023 break; 5024 case IceType_i16: 5025 Eax = Traits::RegisterSet::Reg_ax; 5026 break; 5027 case IceType_i8: 5028 Eax = Traits::RegisterSet::Reg_al; 5029 break; 5030 } 5031 Variable *T_eax = makeReg(Ty, Eax); 5032 _mov(T_eax, Addr); 5033 auto *Label = Context.insert<InstX86Label>(this); 5034 // We want to pick a different register for T than Eax, so don't use 5035 // _mov(T == nullptr, T_eax). 5036 Variable *T = makeReg(Ty); 5037 _mov(T, T_eax); 5038 (this->*Op_Lo)(T, Val); 5039 constexpr bool Locked = true; 5040 _cmpxchg(Addr, T_eax, T, Locked); 5041 _br(Traits::Cond::Br_ne, Label); 5042 // If Val is a variable, model the extended live range of Val through 5043 // the end of the loop, since it will be re-used by the loop. 5044 if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) { 5045 Context.insert<InstFakeUse>(ValVar); 5046 } 5047 // The address base (if any) is also reused in the loop. 
5048 if (Variable *Base = Addr->getBase()) 5049 Context.insert<InstFakeUse>(Base); 5050 _mov(Dest, T_eax); 5051 } 5052 5053 /// Lowers count {trailing, leading} zeros intrinsic. 5054 /// 5055 /// We could do constant folding here, but that should have 5056 /// been done by the front-end/middle-end optimizations. 5057 template <typename TraitsType> 5058 void TargetX86Base<TraitsType>::lowerCountZeros(bool Cttz, Type Ty, 5059 Variable *Dest, 5060 Operand *FirstVal, 5061 Operand *SecondVal) { 5062 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI). 5063 // Then the instructions will handle the Val == 0 case much more simply 5064 // and won't require conversion from bit position to number of zeros. 5065 // 5066 // Otherwise: 5067 // bsr IF_NOT_ZERO, Val 5068 // mov T_DEST, ((Ty == i32) ? 63 : 127) 5069 // cmovne T_DEST, IF_NOT_ZERO 5070 // xor T_DEST, ((Ty == i32) ? 31 : 63) 5071 // mov DEST, T_DEST 5072 // 5073 // NOTE: T_DEST must be a register because cmov requires its dest to be a 5074 // register. Also, bsf and bsr require their dest to be a register. 5075 // 5076 // The xor DEST, C(31|63) converts a bit position to # of leading zeroes. 5077 // E.g., for 000... 00001100, bsr will say that the most significant bit 5078 // set is at position 3, while the number of leading zeros is 28. Xor is 5079 // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the 5080 // all-zeros case). 5081 // 5082 // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32 5083 // bits are all zero, and compute the result for that case (checking the 5084 // lower 32 bits). Then actually compute the result for the upper bits and 5085 // cmov in the result from the lower computation if the earlier speculation 5086 // was correct. 5087 // 5088 // Cttz, is similar, but uses bsf instead, and doesn't require the xor 5089 // bit position conversion, and the speculation is reversed. 5090 5091 // TODO(jpp): refactor this method. 5092 assert(Ty == IceType_i32 || Ty == IceType_i64); 5093 const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32; 5094 Variable *T = makeReg(DestTy); 5095 Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg); 5096 if (Cttz) { 5097 _bsf(T, FirstValRM); 5098 } else { 5099 _bsr(T, FirstValRM); 5100 } 5101 Variable *T_Dest = makeReg(DestTy); 5102 Constant *_31 = Ctx->getConstantInt32(31); 5103 Constant *_32 = Ctx->getConstantInt(DestTy, 32); 5104 Constant *_63 = Ctx->getConstantInt(DestTy, 63); 5105 Constant *_64 = Ctx->getConstantInt(DestTy, 64); 5106 if (Cttz) { 5107 if (DestTy == IceType_i64) { 5108 _mov(T_Dest, _64); 5109 } else { 5110 _mov(T_Dest, _32); 5111 } 5112 } else { 5113 Constant *_127 = Ctx->getConstantInt(DestTy, 127); 5114 if (DestTy == IceType_i64) { 5115 _mov(T_Dest, _127); 5116 } else { 5117 _mov(T_Dest, _63); 5118 } 5119 } 5120 _cmov(T_Dest, T, Traits::Cond::Br_ne); 5121 if (!Cttz) { 5122 if (DestTy == IceType_i64) { 5123 // Even though there's a _63 available at this point, that constant might 5124 // not be an i32, which will cause the xor emission to fail. 5125 Constant *_63 = Ctx->getConstantInt32(63); 5126 _xor(T_Dest, _63); 5127 } else { 5128 _xor(T_Dest, _31); 5129 } 5130 } 5131 if (Traits::Is64Bit || Ty == IceType_i32) { 5132 _mov(Dest, T_Dest); 5133 return; 5134 } 5135 _add(T_Dest, _32); 5136 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 5137 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 5138 // Will be using "test" on this, so we need a registerized variable. 
5139 Variable *SecondVar = legalizeToReg(SecondVal); 5140 Variable *T_Dest2 = makeReg(IceType_i32); 5141 if (Cttz) { 5142 _bsf(T_Dest2, SecondVar); 5143 } else { 5144 _bsr(T_Dest2, SecondVar); 5145 _xor(T_Dest2, _31); 5146 } 5147 _test(SecondVar, SecondVar); 5148 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); 5149 _mov(DestLo, T_Dest2); 5150 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); 5151 } 5152 5153 template <typename TraitsType> 5154 void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest, 5155 Variable *Base, Constant *Offset) { 5156 // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to 5157 // legalize Mem properly. 5158 if (Offset) 5159 assert(!llvm::isa<ConstantRelocatable>(Offset)); 5160 5161 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset); 5162 5163 if (isVectorType(Ty)) 5164 _movp(Dest, Mem); 5165 else if (Ty == IceType_f64) 5166 _movq(Dest, Mem); 5167 else 5168 _mov(Dest, Mem); 5169 } 5170 5171 template <typename TraitsType> 5172 void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value, 5173 Variable *Base, Constant *Offset) { 5174 // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to 5175 // legalize Mem properly. 5176 if (Offset) 5177 assert(!llvm::isa<ConstantRelocatable>(Offset)); 5178 5179 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset); 5180 5181 if (isVectorType(Ty)) 5182 _storep(Value, Mem); 5183 else if (Ty == IceType_f64) 5184 _storeq(Value, Mem); 5185 else 5186 _store(Value, Mem); 5187 } 5188 5189 template <typename TraitsType> 5190 void TargetX86Base<TraitsType>::copyMemory(Type Ty, Variable *Dest, 5191 Variable *Src, int32_t OffsetAmt) { 5192 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; 5193 // TODO(ascull): this or add nullptr test to _movp, _movq 5194 Variable *Data = makeReg(Ty); 5195 5196 typedLoad(Ty, Data, Src, Offset); 5197 typedStore(Ty, Data, Dest, Offset); 5198 } 5199 5200 template <typename TraitsType> 5201 void TargetX86Base<TraitsType>::lowerMemcpy(Operand *Dest, Operand *Src, 5202 Operand *Count) { 5203 // There is a load and store for each chunk in the unroll 5204 constexpr uint32_t BytesPerStorep = 16; 5205 5206 // Check if the operands are constants 5207 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); 5208 const bool IsCountConst = CountConst != nullptr; 5209 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; 5210 5211 if (shouldOptimizeMemIntrins() && IsCountConst && 5212 CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) { 5213 // Unlikely, but nothing to do if it does happen 5214 if (CountValue == 0) 5215 return; 5216 5217 Variable *SrcBase = legalizeToReg(Src); 5218 Variable *DestBase = legalizeToReg(Dest); 5219 5220 // Find the largest type that can be used and use it as much as possible in 5221 // reverse order. Then handle any remainder with overlapping copies. Since 5222 // the remainder will be at the end, there will be reduced pressure on the 5223 // memory unit as the accesses to the same memory are far apart. 5224 Type Ty = largestTypeInSize(CountValue); 5225 uint32_t TyWidth = typeWidthInBytes(Ty); 5226 5227 uint32_t RemainingBytes = CountValue; 5228 int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; 5229 while (RemainingBytes >= TyWidth) { 5230 copyMemory(Ty, DestBase, SrcBase, Offset); 5231 RemainingBytes -= TyWidth; 5232 Offset -= TyWidth; 5233 } 5234 5235 if (RemainingBytes == 0) 5236 return; 5237 5238 // Lower the remaining bytes. 
Adjust to larger types in order to make use 5239 // of overlaps in the copies. 5240 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); 5241 Offset = CountValue - typeWidthInBytes(LeftOverTy); 5242 copyMemory(LeftOverTy, DestBase, SrcBase, Offset); 5243 return; 5244 } 5245 5246 // Fall back on a function call 5247 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3); 5248 Call->addArg(Dest); 5249 Call->addArg(Src); 5250 Call->addArg(Count); 5251 lowerCall(Call); 5252 } 5253 5254 template <typename TraitsType> 5255 void TargetX86Base<TraitsType>::lowerMemmove(Operand *Dest, Operand *Src, 5256 Operand *Count) { 5257 // There is a load and store for each chunk in the unroll 5258 constexpr uint32_t BytesPerStorep = 16; 5259 5260 // Check if the operands are constants 5261 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); 5262 const bool IsCountConst = CountConst != nullptr; 5263 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; 5264 5265 if (shouldOptimizeMemIntrins() && IsCountConst && 5266 CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) { 5267 // Unlikely, but nothing to do if it does happen 5268 if (CountValue == 0) 5269 return; 5270 5271 Variable *SrcBase = legalizeToReg(Src); 5272 Variable *DestBase = legalizeToReg(Dest); 5273 5274 std::tuple<Type, Constant *, Variable *> 5275 Moves[Traits::MEMMOVE_UNROLL_LIMIT]; 5276 Constant *Offset; 5277 Variable *Reg; 5278 5279 // Copy the data into registers as the source and destination could overlap 5280 // so make sure not to clobber the memory. This also means overlapping 5281 // moves can be used as we are taking a safe snapshot of the memory. 5282 Type Ty = largestTypeInSize(CountValue); 5283 uint32_t TyWidth = typeWidthInBytes(Ty); 5284 5285 uint32_t RemainingBytes = CountValue; 5286 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; 5287 size_t N = 0; 5288 while (RemainingBytes >= TyWidth) { 5289 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); 5290 Offset = Ctx->getConstantInt32(OffsetAmt); 5291 Reg = makeReg(Ty); 5292 typedLoad(Ty, Reg, SrcBase, Offset); 5293 RemainingBytes -= TyWidth; 5294 OffsetAmt -= TyWidth; 5295 Moves[N++] = std::make_tuple(Ty, Offset, Reg); 5296 } 5297 5298 if (RemainingBytes != 0) { 5299 // Lower the remaining bytes. Adjust to larger types in order to make use 5300 // of overlaps in the copies. 
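// For example, a constant count of 11 would (assuming the usual type
// selection by largestTypeInSize/firstTypeThatFitsSize) be handled as one
// 8-byte load at offset 0 followed by a 4-byte load at offset 7; the overlap
// at byte 7 is harmless because every load completes before any store below
// is issued.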
5301 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
5302 Ty = firstTypeThatFitsSize(RemainingBytes);
5303 Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
5304 Reg = makeReg(Ty);
5305 typedLoad(Ty, Reg, SrcBase, Offset);
5306 Moves[N++] = std::make_tuple(Ty, Offset, Reg);
5307 }
5308
5309 // Copy the data out into the destination memory
5310 for (size_t i = 0; i < N; ++i) {
5311 std::tie(Ty, Offset, Reg) = Moves[i];
5312 typedStore(Ty, Reg, DestBase, Offset);
5313 }
5314
5315 return;
5316 }
5317
5318 // Fall back on a function call
5319 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
5320 Call->addArg(Dest);
5321 Call->addArg(Src);
5322 Call->addArg(Count);
5323 lowerCall(Call);
5324 }
5325
5326 template <typename TraitsType>
5327 void TargetX86Base<TraitsType>::lowerMemset(Operand *Dest, Operand *Val,
5328 Operand *Count) {
5329 constexpr uint32_t BytesPerStorep = 16;
5330 constexpr uint32_t BytesPerStoreq = 8;
5331 constexpr uint32_t BytesPerStorei32 = 4;
5332 assert(Val->getType() == IceType_i8);
5333
5334 // Check if the operands are constants
5335 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
5336 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
5337 const bool IsCountConst = CountConst != nullptr;
5338 const bool IsValConst = ValConst != nullptr;
5339 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
5340 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
5341
5342 // Unlikely, but nothing to do if it does happen
5343 if (IsCountConst && CountValue == 0)
5344 return;
5345
5346 // TODO(ascull): if the count is constant but val is not it would be possible
5347 // to inline by spreading the value across 4 bytes and accessing subregs e.g.
5348 // eax, ax and al.
5349 if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
5350 Variable *Base = nullptr;
5351 Variable *VecReg = nullptr;
5352 const uint32_t MaskValue = (ValValue & 0xff);
5353 const uint32_t SpreadValue =
5354 (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
5355
5356 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
5357 uint32_t OffsetAmt) {
5358 assert(Base != nullptr);
5359 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
5360
5361 // TODO(ascull): is 64-bit better with vector or scalar movq?
5362 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
5363 if (isVectorType(Ty)) {
5364 assert(VecReg != nullptr);
5365 _storep(VecReg, Mem);
5366 } else if (Ty == IceType_f64) {
5367 assert(VecReg != nullptr);
5368 _storeq(VecReg, Mem);
5369 } else {
5370 assert(Ty != IceType_i64);
5371 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
5372 }
5373 };
5374
5375 // Find the largest type that can be used and use it as much as possible in
5376 // reverse order. Then handle any remainder with overlapping copies. Since
5377 // the remainder will be at the end, there will be reduced pressure on the
5378 // memory unit as the accesses to the same memory are far apart.
5379 Type Ty = IceType_void;
5380 if (ValValue == 0 && CountValue >= BytesPerStoreq &&
5381 CountValue <= BytesPerStorep * Traits::MEMSET_UNROLL_LIMIT) {
5382 // When the value is zero it can be loaded into a vector register cheaply
5383 // using the xor trick.
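// (The "xor trick" is xor'ing a register with itself, e.g. something like
// "pxor xmm0, xmm0", to obtain all zeros without a load; this is what
// makeVectorOfZeros() below is expected to produce. The register name is
// illustrative.)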
5384 Base = legalizeToReg(Dest); 5385 VecReg = makeVectorOfZeros(IceType_v16i8); 5386 Ty = largestTypeInSize(CountValue); 5387 } else if (CountValue <= BytesPerStorei32 * Traits::MEMSET_UNROLL_LIMIT) { 5388 // When the value is non-zero or the count is small we can't use vector 5389 // instructions so are limited to 32-bit stores. 5390 Base = legalizeToReg(Dest); 5391 constexpr uint32_t MaxSize = 4; 5392 Ty = largestTypeInSize(CountValue, MaxSize); 5393 } 5394 5395 if (Base) { 5396 uint32_t TyWidth = typeWidthInBytes(Ty); 5397 5398 uint32_t RemainingBytes = CountValue; 5399 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; 5400 while (RemainingBytes >= TyWidth) { 5401 lowerSet(Ty, Offset); 5402 RemainingBytes -= TyWidth; 5403 Offset -= TyWidth; 5404 } 5405 5406 if (RemainingBytes == 0) 5407 return; 5408 5409 // Lower the remaining bytes. Adjust to larger types in order to make use 5410 // of overlaps in the copies. 5411 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); 5412 Offset = CountValue - typeWidthInBytes(LeftOverTy); 5413 lowerSet(LeftOverTy, Offset); 5414 return; 5415 } 5416 } 5417 5418 // Fall back on calling the memset function. The value operand needs to be 5419 // extended to a stack slot size because the PNaCl ABI requires arguments to 5420 // be at least 32 bits wide. 5421 Operand *ValExt; 5422 if (IsValConst) { 5423 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); 5424 } else { 5425 Variable *ValExtVar = Func->makeVariable(stackSlotType()); 5426 lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val)); 5427 ValExt = ValExtVar; 5428 } 5429 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3); 5430 Call->addArg(Dest); 5431 Call->addArg(ValExt); 5432 Call->addArg(Count); 5433 lowerCall(Call); 5434 } 5435 5436 class AddressOptimizer { 5437 AddressOptimizer() = delete; 5438 AddressOptimizer(const AddressOptimizer &) = delete; 5439 AddressOptimizer &operator=(const AddressOptimizer &) = delete; 5440 5441 public: 5442 explicit AddressOptimizer(const Cfg *Func) 5443 : Func(Func), VMetadata(Func->getVMetadata()) {} 5444 5445 inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable, 5446 int32_t Offset, const Variable *Base, 5447 const Variable *Index, uint16_t Shift, 5448 const Inst *Reason) const; 5449 5450 inline const Inst *matchAssign(Variable **Var, 5451 ConstantRelocatable **Relocatable, 5452 int32_t *Offset); 5453 5454 inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index, 5455 uint16_t *Shift); 5456 5457 inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift); 5458 5459 inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase, 5460 const uint16_t Shift, 5461 ConstantRelocatable **Relocatable, 5462 int32_t *Offset); 5463 5464 private: 5465 const Cfg *const Func; 5466 const VariablesMetadata *const VMetadata; 5467 5468 static bool isAdd(const Inst *Instr) { 5469 if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) { 5470 return (Arith->getOp() == InstArithmetic::Add); 5471 } 5472 return false; 5473 } 5474 }; 5475 5476 void AddressOptimizer::dumpAddressOpt( 5477 const ConstantRelocatable *const Relocatable, int32_t Offset, 5478 const Variable *Base, const Variable *Index, uint16_t Shift, 5479 const Inst *Reason) const { 5480 if (!BuildDefs::dump()) 5481 return; 5482 if (!Func->isVerbose(IceV_AddrOpt)) 5483 return; 5484 OstreamLocker L(Func->getContext()); 5485 Ostream &Str = Func->getContext()->getStrDump(); 5486 Str << "Instruction: "; 5487 
Reason->dumpDecorated(Func); 5488 Str << " results in Base="; 5489 if (Base) 5490 Base->dump(Func); 5491 else 5492 Str << "<null>"; 5493 Str << ", Index="; 5494 if (Index) 5495 Index->dump(Func); 5496 else 5497 Str << "<null>"; 5498 Str << ", Shift=" << Shift << ", Offset=" << Offset 5499 << ", Relocatable=" << Relocatable << "\n"; 5500 } 5501 5502 const Inst *AddressOptimizer::matchAssign(Variable **Var, 5503 ConstantRelocatable **Relocatable, 5504 int32_t *Offset) { 5505 // Var originates from Var=SrcVar ==> set Var:=SrcVar 5506 if (*Var == nullptr) 5507 return nullptr; 5508 if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) { 5509 assert(!VMetadata->isMultiDef(*Var)); 5510 if (llvm::isa<InstAssign>(VarAssign)) { 5511 Operand *SrcOp = VarAssign->getSrc(0); 5512 assert(SrcOp); 5513 if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) { 5514 if (!VMetadata->isMultiDef(SrcVar) && 5515 // TODO: ensure SrcVar stays single-BB 5516 true) { 5517 *Var = SrcVar; 5518 return VarAssign; 5519 } 5520 } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) { 5521 int32_t MoreOffset = Const->getValue(); 5522 if (Utils::WouldOverflowAdd(*Offset, MoreOffset)) 5523 return nullptr; 5524 *Var = nullptr; 5525 *Offset += MoreOffset; 5526 return VarAssign; 5527 } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) { 5528 if (*Relocatable == nullptr) { 5529 // It is always safe to fold a relocatable through assignment -- the 5530 // assignment frees a slot in the address operand that can be used to 5531 // hold the Sandbox Pointer -- if any. 5532 *Var = nullptr; 5533 *Relocatable = AddReloc; 5534 return VarAssign; 5535 } 5536 } 5537 } 5538 } 5539 return nullptr; 5540 } 5541 5542 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base, 5543 Variable **Index, 5544 uint16_t *Shift) { 5545 // Index==nullptr && Base is Base=Var1+Var2 ==> 5546 // set Base=Var1, Index=Var2, Shift=0 5547 if (*Base == nullptr) 5548 return nullptr; 5549 if (*Index != nullptr) 5550 return nullptr; 5551 auto *BaseInst = VMetadata->getSingleDefinition(*Base); 5552 if (BaseInst == nullptr) 5553 return nullptr; 5554 assert(!VMetadata->isMultiDef(*Base)); 5555 if (BaseInst->getSrcSize() < 2) 5556 return nullptr; 5557 if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) { 5558 if (VMetadata->isMultiDef(Var1)) 5559 return nullptr; 5560 if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) { 5561 if (VMetadata->isMultiDef(Var2)) 5562 return nullptr; 5563 if (isAdd(BaseInst) && 5564 // TODO: ensure Var1 and Var2 stay single-BB 5565 true) { 5566 *Base = Var1; 5567 *Index = Var2; 5568 *Shift = 0; // should already have been 0 5569 return BaseInst; 5570 } 5571 } 5572 } 5573 return nullptr; 5574 } 5575 5576 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index, 5577 uint16_t *Shift) { 5578 // Index is Index=Var*Const && log2(Const)+Shift<=3 ==> 5579 // Index=Var, Shift+=log2(Const) 5580 if (*Index == nullptr) 5581 return nullptr; 5582 auto *IndexInst = VMetadata->getSingleDefinition(*Index); 5583 if (IndexInst == nullptr) 5584 return nullptr; 5585 assert(!VMetadata->isMultiDef(*Index)); 5586 5587 // When using an unsigned 32-bit array index on x64, it gets zero-extended 5588 // before the shift & add. The explicit zero extension can be eliminated 5589 // because x86 32-bit operations automatically get zero-extended into the 5590 // corresponding 64-bit register. 
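// For example (illustrative IR names):
//   %idx   = shl i32 %i, 2
//   %idx64 = zext i32 %idx to i64   ; %idx64 is the current Index
// The zext is looked through: since the 32-bit definition of %idx already
// cleared the upper bits, the multiply/shift matching below is applied to
// the shl that defines %idx.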
5591 if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) { 5592 if (CastInst->getCastKind() == InstCast::Zext) { 5593 if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) { 5594 if (Var->getType() == IceType_i32 && 5595 CastInst->getDest()->getType() == IceType_i64) { 5596 IndexInst = VMetadata->getSingleDefinition(Var); 5597 } 5598 } 5599 } 5600 } 5601 5602 if (IndexInst->getSrcSize() < 2) 5603 return nullptr; 5604 if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) { 5605 if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) { 5606 if (auto *Const = 5607 llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) { 5608 if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32) 5609 return nullptr; 5610 switch (ArithInst->getOp()) { 5611 default: 5612 return nullptr; 5613 case InstArithmetic::Mul: { 5614 uint32_t Mult = Const->getValue(); 5615 uint32_t LogMult; 5616 switch (Mult) { 5617 case 1: 5618 LogMult = 0; 5619 break; 5620 case 2: 5621 LogMult = 1; 5622 break; 5623 case 4: 5624 LogMult = 2; 5625 break; 5626 case 8: 5627 LogMult = 3; 5628 break; 5629 default: 5630 return nullptr; 5631 } 5632 if (*Shift + LogMult <= 3) { 5633 *Index = Var; 5634 *Shift += LogMult; 5635 return IndexInst; 5636 } 5637 } 5638 case InstArithmetic::Shl: { 5639 uint32_t ShiftAmount = Const->getValue(); 5640 switch (ShiftAmount) { 5641 case 0: 5642 case 1: 5643 case 2: 5644 case 3: 5645 break; 5646 default: 5647 return nullptr; 5648 } 5649 if (*Shift + ShiftAmount <= 3) { 5650 *Index = Var; 5651 *Shift += ShiftAmount; 5652 return IndexInst; 5653 } 5654 } 5655 } 5656 } 5657 } 5658 } 5659 return nullptr; 5660 } 5661 5662 const Inst *AddressOptimizer::matchOffsetIndexOrBase( 5663 Variable **IndexOrBase, const uint16_t Shift, 5664 ConstantRelocatable **Relocatable, int32_t *Offset) { 5665 // Base is Base=Var+Const || Base is Base=Const+Var ==> 5666 // set Base=Var, Offset+=Const 5667 // Base is Base=Var-Const ==> 5668 // set Base=Var, Offset-=Const 5669 // Index is Index=Var+Const ==> 5670 // set Index=Var, Offset+=(Const<<Shift) 5671 // Index is Index=Const+Var ==> 5672 // set Index=Var, Offset+=(Const<<Shift) 5673 // Index is Index=Var-Const ==> 5674 // set Index=Var, Offset-=(Const<<Shift) 5675 // Treat Index=Var Or Const as Index=Var + Const 5676 // when Var = Var' << N and log2(Const) <= N 5677 // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N) 5678 5679 if (*IndexOrBase == nullptr) { 5680 return nullptr; 5681 } 5682 const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase); 5683 if (Definition == nullptr) { 5684 return nullptr; 5685 } 5686 assert(!VMetadata->isMultiDef(*IndexOrBase)); 5687 if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) { 5688 switch (ArithInst->getOp()) { 5689 case InstArithmetic::Add: 5690 case InstArithmetic::Sub: 5691 case InstArithmetic::Or: 5692 break; 5693 default: 5694 return nullptr; 5695 } 5696 5697 Operand *Src0 = ArithInst->getSrc(0); 5698 Operand *Src1 = ArithInst->getSrc(1); 5699 auto *Var0 = llvm::dyn_cast<Variable>(Src0); 5700 auto *Var1 = llvm::dyn_cast<Variable>(Src1); 5701 auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0); 5702 auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1); 5703 auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0); 5704 auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1); 5705 5706 bool IsAdd = false; 5707 if (ArithInst->getOp() == InstArithmetic::Or) { 5708 Variable *Var = nullptr; 5709 ConstantInteger32 *Const = nullptr; 5710 if (Var0 && Const1) { 
5711 Var = Var0; 5712 Const = Const1; 5713 } else if (Const0 && Var1) { 5714 Var = Var1; 5715 Const = Const0; 5716 } else { 5717 return nullptr; 5718 } 5719 auto *VarDef = 5720 llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var)); 5721 if (VarDef == nullptr) 5722 return nullptr; 5723 5724 SizeT ZeroesAvailable = 0; 5725 if (VarDef->getOp() == InstArithmetic::Shl) { 5726 if (auto *ConstInt = 5727 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) { 5728 ZeroesAvailable = ConstInt->getValue(); 5729 } 5730 } else if (VarDef->getOp() == InstArithmetic::Mul) { 5731 SizeT PowerOfTwo = 0; 5732 if (auto *MultConst = 5733 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) { 5734 if (llvm::isPowerOf2_32(MultConst->getValue())) { 5735 PowerOfTwo += MultConst->getValue(); 5736 } 5737 } 5738 if (auto *MultConst = 5739 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) { 5740 if (llvm::isPowerOf2_32(MultConst->getValue())) { 5741 PowerOfTwo += MultConst->getValue(); 5742 } 5743 } 5744 ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1; 5745 } 5746 SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1; 5747 if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable) 5748 return nullptr; 5749 IsAdd = true; // treat it as an add if the above conditions hold 5750 } else { 5751 IsAdd = ArithInst->getOp() == InstArithmetic::Add; 5752 } 5753 5754 Variable *NewIndexOrBase = nullptr; 5755 int32_t NewOffset = 0; 5756 ConstantRelocatable *NewRelocatable = *Relocatable; 5757 if (Var0 && Var1) 5758 // TODO(sehr): merge base/index splitting into here. 5759 return nullptr; 5760 if (!IsAdd && Var1) 5761 return nullptr; 5762 if (Var0) 5763 NewIndexOrBase = Var0; 5764 else if (Var1) 5765 NewIndexOrBase = Var1; 5766 // Don't know how to add/subtract two relocatables. 5767 if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1)) 5768 return nullptr; 5769 // Don't know how to subtract a relocatable. 5770 if (!IsAdd && Reloc1) 5771 return nullptr; 5772 // Incorporate ConstantRelocatables. 5773 if (Reloc0) 5774 NewRelocatable = Reloc0; 5775 else if (Reloc1) 5776 NewRelocatable = Reloc1; 5777 // Compute the updated constant offset. 5778 if (Const0) { 5779 const int32_t MoreOffset = 5780 IsAdd ? Const0->getValue() : -Const0->getValue(); 5781 if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset)) 5782 return nullptr; 5783 NewOffset += MoreOffset; 5784 } 5785 if (Const1) { 5786 const int32_t MoreOffset = 5787 IsAdd ? 
Const1->getValue() : -Const1->getValue(); 5788 if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset)) 5789 return nullptr; 5790 NewOffset += MoreOffset; 5791 } 5792 if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift)) 5793 return nullptr; 5794 *IndexOrBase = NewIndexOrBase; 5795 *Offset += (NewOffset << Shift); 5796 // Shift is always zero if this is called with the base 5797 *Relocatable = NewRelocatable; 5798 return Definition; 5799 } 5800 return nullptr; 5801 } 5802 5803 template <typename TypeTraits> 5804 typename TargetX86Base<TypeTraits>::X86OperandMem * 5805 TargetX86Base<TypeTraits>::computeAddressOpt(const Inst *Instr, Type MemType, 5806 Operand *Addr) { 5807 Func->resetCurrentNode(); 5808 if (Func->isVerbose(IceV_AddrOpt)) { 5809 OstreamLocker L(Func->getContext()); 5810 Ostream &Str = Func->getContext()->getStrDump(); 5811 Str << "\nStarting computeAddressOpt for instruction:\n "; 5812 Instr->dumpDecorated(Func); 5813 } 5814 5815 OptAddr NewAddr; 5816 NewAddr.Base = llvm::dyn_cast<Variable>(Addr); 5817 if (NewAddr.Base == nullptr) 5818 return nullptr; 5819 5820 // If the Base has more than one use or is live across multiple blocks, then 5821 // don't go further. Alternatively (?), never consider a transformation that 5822 // would change a variable that is currently *not* live across basic block 5823 // boundaries into one that *is*. 5824 if (!getFlags().getLoopInvariantCodeMotion()) { 5825 // Need multi block address opt when licm is enabled. 5826 // Might make sense to restrict to current node and loop header. 5827 if (Func->getVMetadata()->isMultiBlock( 5828 NewAddr.Base) /* || Base->getUseCount() > 1*/) 5829 return nullptr; 5830 } 5831 AddressOptimizer AddrOpt(Func); 5832 const bool MockBounds = getFlags().getMockBoundsCheck(); 5833 const Inst *Reason = nullptr; 5834 bool AddressWasOptimized = false; 5835 // The following unnamed struct identifies the address mode formation steps 5836 // that could potentially create an invalid memory operand (i.e., no free 5837 // slots for RebasePtr.) We add all those variables to this struct so that we 5838 // can use memset() to reset all members to false. 5839 struct { 5840 bool AssignBase = false; 5841 bool AssignIndex = false; 5842 bool OffsetFromBase = false; 5843 bool OffsetFromIndex = false; 5844 bool CombinedBaseIndex = false; 5845 } Skip; 5846 // This points to the boolean in Skip that represents the last folding 5847 // performed. This is used to disable a pattern match that generated an 5848 // invalid address. Without this, the algorithm would never finish. 5849 bool *SkipLastFolding = nullptr; 5850 // NewAddrCheckpoint is used to rollback the address being formed in case an 5851 // invalid address is formed. 5852 OptAddr NewAddrCheckpoint; 5853 Reason = Instr; 5854 do { 5855 if (SandboxingType != ST_None) { 5856 // When sandboxing, we defer the sandboxing of NewAddr to the Concrete 5857 // Target. If our optimization was overly aggressive, then we simply undo 5858 // what the previous iteration did, and set the previous pattern's skip 5859 // bit to true. 
5860 if (!legalizeOptAddrForSandbox(&NewAddr)) {
5861 *SkipLastFolding = true;
5862 SkipLastFolding = nullptr;
5863 NewAddr = NewAddrCheckpoint;
5864 Reason = nullptr;
5865 }
5866 }
5867
5868 if (Reason) {
5869 AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
5870 NewAddr.Index, NewAddr.Shift, Reason);
5871 AddressWasOptimized = true;
5872 Reason = nullptr;
5873 SkipLastFolding = nullptr;
5874 memset(reinterpret_cast<void*>(&Skip), 0, sizeof(Skip));
5875 }
5876
5877 NewAddrCheckpoint = NewAddr;
5878
5879 // Update Base and Index to follow through assignments to definitions.
5880 if (!Skip.AssignBase &&
5881 (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
5882 &NewAddr.Offset))) {
5883 SkipLastFolding = &Skip.AssignBase;
5884 // Assignments of Base from a Relocatable or ConstantInt32 can result
5885 // in Base becoming nullptr. To avoid code duplication in this loop we
5886 // prefer that Base be non-nullptr if possible.
5887 if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
5888 NewAddr.Shift == 0) {
5889 std::swap(NewAddr.Base, NewAddr.Index);
5890 }
5891 continue;
5892 }
5893 if (!Skip.AssignIndex &&
5894 (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
5895 &NewAddr.Offset))) {
5896 SkipLastFolding = &Skip.AssignIndex;
5897 continue;
5898 }
5899
5900 if (!MockBounds) {
5901 // Transition from:
5902 // <Relocatable + Offset>(Base) to
5903 // <Relocatable + Offset>(Base, Index)
5904 if (!Skip.CombinedBaseIndex &&
5905 (Reason = AddrOpt.matchCombinedBaseIndex(
5906 &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
5907 SkipLastFolding = &Skip.CombinedBaseIndex;
5908 continue;
5909 }
5910
5911 // Recognize multiply/shift and update Shift amount.
5912 // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
5913 // Index=Var, Shift+=Const
5914 // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
5915 // Index=Var, Shift+=log2(Const)
5916 if ((Reason =
5917 AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
5918 continue;
5919 }
5920
5921 // If Shift is zero, the choice of Base and Index was purely arbitrary.
5922 // Recognize multiply/shift and set Shift amount.
5923 // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
5924 // swap(Index,Base)
5925 // Similar for Base=Const*Var and Base=Var<<Const
5926 if (NewAddr.Shift == 0 &&
5927 (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
5928 std::swap(NewAddr.Base, NewAddr.Index);
5929 continue;
5930 }
5931 }
5932
5933 // Update Offset to reflect additions/subtractions with constants and
5934 // relocatables.
5935 // TODO: consider overflow issues with respect to Offset.
5936 if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
5937 &NewAddr.Base, /*Shift =*/0,
5938 &NewAddr.Relocatable, &NewAddr.Offset))) {
5939 SkipLastFolding = &Skip.OffsetFromBase;
5940 continue;
5941 }
5942 if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
5943 &NewAddr.Index, NewAddr.Shift,
5944 &NewAddr.Relocatable, &NewAddr.Offset))) {
5945 SkipLastFolding = &Skip.OffsetFromIndex;
5946 continue;
5947 }
5948
5949 break;
5950 } while (Reason);
5951
5952 if (!AddressWasOptimized) {
5953 return nullptr;
5954 }
5955
5956 // Undo any addition of RebasePtr. It will be added back when the mem
5957 // operand is sandboxed.
5958 if (NewAddr.Base == RebasePtr) { 5959 NewAddr.Base = nullptr; 5960 } 5961 5962 if (NewAddr.Index == RebasePtr) { 5963 NewAddr.Index = nullptr; 5964 NewAddr.Shift = 0; 5965 } 5966 5967 Constant *OffsetOp = nullptr; 5968 if (NewAddr.Relocatable == nullptr) { 5969 OffsetOp = Ctx->getConstantInt32(NewAddr.Offset); 5970 } else { 5971 OffsetOp = 5972 Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset, 5973 NewAddr.Relocatable->getName()); 5974 } 5975 // Vanilla ICE load instructions should not use the segment registers, and 5976 // computeAddressOpt only works at the level of Variables and Constants, not 5977 // other X86OperandMem, so there should be no mention of segment 5978 // registers there either. 5979 static constexpr auto SegmentReg = 5980 X86OperandMem::SegmentRegisters::DefaultSegment; 5981 5982 return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp, 5983 NewAddr.Index, NewAddr.Shift, SegmentReg); 5984 } 5985 5986 /// Add a mock bounds check on the memory address before using it as a load or 5987 /// store operand. The basic idea is that given a memory operand [reg], we 5988 /// would first add bounds-check code something like: 5989 /// 5990 /// cmp reg, <lb> 5991 /// jl out_of_line_error 5992 /// cmp reg, <ub> 5993 /// jg out_of_line_error 5994 /// 5995 /// In reality, the specific code will depend on how <lb> and <ub> are 5996 /// represented, e.g. an immediate, a global, or a function argument. 5997 /// 5998 /// As such, we need to enforce that the memory operand does not have the form 5999 /// [reg1+reg2], because then there is no simple cmp instruction that would 6000 /// suffice. However, we consider [reg+offset] to be OK because the offset is 6001 /// usually small, and so <ub> could have a safety buffer built in and then we 6002 /// could instead branch to a custom out_of_line_error that does the precise 6003 /// check and jumps back if it turns out OK. 6004 /// 6005 /// For the purpose of mocking the bounds check, we'll do something like this: 6006 /// 6007 /// cmp reg, 0 6008 /// je label 6009 /// cmp reg, 1 6010 /// je label 6011 /// label: 6012 /// 6013 /// Also note that we don't need to add a bounds check to a dereference of a 6014 /// simple global variable address. 6015 template <typename TraitsType> 6016 void TargetX86Base<TraitsType>::doMockBoundsCheck(Operand *Opnd) { 6017 if (!getFlags().getMockBoundsCheck()) 6018 return; 6019 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) { 6020 if (Mem->getIndex()) { 6021 llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg"); 6022 } 6023 Opnd = Mem->getBase(); 6024 } 6025 // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps 6026 // something else. We only care if it is Variable. 6027 auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd); 6028 if (Var == nullptr) 6029 return; 6030 // We use lowerStore() to copy out-args onto the stack. This creates a memory 6031 // operand with the stack pointer as the base register. Don't do bounds 6032 // checks on that. 
6033 if (Var->getRegNum() == getStackReg()) 6034 return; 6035 6036 auto *Label = InstX86Label::create(Func, this); 6037 _cmp(Opnd, Ctx->getConstantZero(IceType_i32)); 6038 _br(Traits::Cond::Br_e, Label); 6039 _cmp(Opnd, Ctx->getConstantInt32(1)); 6040 _br(Traits::Cond::Br_e, Label); 6041 Context.insert(Label); 6042 } 6043 6044 template <typename TraitsType> 6045 void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) { 6046 // A Load instruction can be treated the same as an Assign instruction, after 6047 // the source operand is transformed into an X86OperandMem operand. Note that 6048 // the address mode optimization already creates an X86OperandMem operand, so 6049 // it doesn't need another level of transformation. 6050 Variable *DestLoad = Load->getDest(); 6051 Type Ty = DestLoad->getType(); 6052 Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty); 6053 doMockBoundsCheck(Src0); 6054 auto *Assign = InstAssign::create(Func, DestLoad, Src0); 6055 lowerAssign(Assign); 6056 } 6057 6058 template <typename TraitsType> 6059 void TargetX86Base<TraitsType>::doAddressOptOther() { 6060 // Inverts some Icmp instructions which helps doAddressOptLoad later. 6061 // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1 6062 Inst *Instr = iteratorToInst(Context.getCur()); 6063 auto *VMetadata = Func->getVMetadata(); 6064 if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) { 6065 if (llvm::isa<Constant>(Icmp->getSrc(0)) || 6066 llvm::isa<Constant>(Icmp->getSrc(1))) 6067 return; 6068 auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0)); 6069 if (Var0 == nullptr) 6070 return; 6071 if (!VMetadata->isTracked(Var0)) 6072 return; 6073 auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0); 6074 if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def)) 6075 return; 6076 if (VMetadata->getLocalUseNode(Var0) != Context.getNode()) 6077 return; 6078 6079 auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1)); 6080 if (Var1 != nullptr && VMetadata->isTracked(Var1)) { 6081 auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1); 6082 if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) && 6083 llvm::isa<InstLoad>(Op1Def)) { 6084 return; // Both are loads 6085 } 6086 } 6087 Icmp->reverseConditionAndOperands(); 6088 } 6089 } 6090 6091 template <typename TraitsType> 6092 void TargetX86Base<TraitsType>::doAddressOptLoad() { 6093 Inst *Instr = iteratorToInst(Context.getCur()); 6094 Operand *Addr = Instr->getSrc(0); 6095 Variable *Dest = Instr->getDest(); 6096 if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) { 6097 Instr->setDeleted(); 6098 Context.insert<InstLoad>(Dest, OptAddr); 6099 } 6100 } 6101 6102 template <typename TraitsType> 6103 void TargetX86Base<TraitsType>::doAddressOptLoadSubVector() { 6104 auto *Intrinsic = llvm::cast<InstIntrinsicCall>(Context.getCur()); 6105 Operand *Addr = Intrinsic->getArg(0); 6106 Variable *Dest = Intrinsic->getDest(); 6107 if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) { 6108 Intrinsic->setDeleted(); 6109 const Ice::Intrinsics::IntrinsicInfo Info = { 6110 Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, 6111 Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F}; 6112 auto Target = Ctx->getConstantUndef(Ice::IceType_i32); 6113 auto *NewLoad = Context.insert<InstIntrinsicCall>(2, Dest, Target, Info); 6114 NewLoad->addArg(OptAddr); 6115 NewLoad->addArg(Intrinsic->getArg(1)); 6116 } 6117 } 6118 6119 template <typename TraitsType> 6120 void 
TargetX86Base<TraitsType>::randomlyInsertNop(float Probability, 6121 RandomNumberGenerator &RNG) { 6122 RandomNumberGeneratorWrapper RNGW(RNG); 6123 if (RNGW.getTrueWithProbability(Probability)) { 6124 _nop(RNGW(Traits::X86_NUM_NOP_VARIANTS)); 6125 } 6126 } 6127 6128 template <typename TraitsType> 6129 void TargetX86Base<TraitsType>::lowerPhi(const InstPhi * /*Instr*/) { 6130 Func->setError("Phi found in regular instruction list"); 6131 } 6132 6133 template <typename TraitsType> 6134 void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) { 6135 Variable *Reg = nullptr; 6136 if (Instr->hasRetValue()) { 6137 Operand *RetValue = legalize(Instr->getRetValue()); 6138 const Type ReturnType = RetValue->getType(); 6139 assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) || 6140 (ReturnType == IceType_i32) || (ReturnType == IceType_i64)); 6141 Reg = moveReturnValueToRegister(RetValue, ReturnType); 6142 } 6143 // Add a ret instruction even if sandboxing is enabled, because addEpilog 6144 // explicitly looks for a ret instruction as a marker for where to insert the 6145 // frame removal instructions. 6146 _ret(Reg); 6147 // Add a fake use of esp to make sure esp stays alive for the entire 6148 // function. Otherwise post-call esp adjustments get dead-code eliminated. 6149 keepEspLiveAtExit(); 6150 } 6151 6152 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2, 6153 SizeT Index3) { 6154 const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) | 6155 ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6); 6156 assert(Mask < 256); 6157 return Mask; 6158 } 6159 6160 template <typename TraitsType> 6161 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc( 6162 Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) { 6163 constexpr SizeT SrcBit = 1 << 2; 6164 assert((Index0 & SrcBit) == (Index1 & SrcBit)); 6165 assert((Index0 & SrcBit) == (Index2 & SrcBit)); 6166 assert((Index0 & SrcBit) == (Index3 & SrcBit)); 6167 (void)SrcBit; 6168 6169 const Type SrcTy = Src->getType(); 6170 auto *T = makeReg(SrcTy); 6171 auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem); 6172 auto *Mask = 6173 Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3)); 6174 _pshufd(T, SrcRM, Mask); 6175 return T; 6176 } 6177 6178 template <typename TraitsType> 6179 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc( 6180 Operand *Src0, SizeT Index0, SizeT Index1, Operand *Src1, SizeT Index2, 6181 SizeT Index3) { 6182 constexpr SizeT SrcBit = 1 << 2; 6183 assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX)); 6184 assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX)); 6185 (void)SrcBit; 6186 6187 const Type SrcTy = Src0->getType(); 6188 assert(Src1->getType() == SrcTy); 6189 auto *T = makeReg(SrcTy); 6190 auto *Src0R = legalizeToReg(Src0); 6191 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6192 auto *Mask = 6193 Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3)); 6194 _movp(T, Src0R); 6195 _shufps(T, Src1RM, Mask); 6196 return T; 6197 } 6198 6199 template <typename TraitsType> 6200 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs( 6201 Operand *Src0, SizeT Index0, Operand *Src1, SizeT Index1) { 6202 return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1, 6203 Index1, IGNORE_INDEX); 6204 } 6205 6206 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2, 6207 SizeT Index3) { 6208 constexpr SizeT SrcBit = 
1 << 2; 6209 const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0); 6210 const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1); 6211 const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2); 6212 const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3); 6213 return Index0Bits | Index1Bits | Index2Bits | Index3Bits; 6214 } 6215 6216 template <typename TraitsType> 6217 GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() { 6218 GlobalString FuncName = Func->getFunctionName(); 6219 const SizeT Id = PshufbMaskCount++; 6220 if (!BuildDefs::dump() || !FuncName.hasStdString()) { 6221 return GlobalString::createWithString( 6222 Ctx, 6223 "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id)); 6224 } 6225 return GlobalString::createWithString( 6226 Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id)); 6227 } 6228 6229 template <typename TraitsType> 6230 ConstantRelocatable * 6231 TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask( 6232 int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4, 6233 int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9, 6234 int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14, 6235 int8_t Idx15) { 6236 static constexpr uint8_t NumElements = 16; 6237 const char Initializer[NumElements] = { 6238 Idx0, Idx1, Idx2, Idx3, Idx4, Idx5, Idx6, Idx7, 6239 Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15, 6240 }; 6241 6242 static constexpr Type V4VectorType = IceType_v4i32; 6243 const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType); 6244 auto *Mask = VariableDeclaration::create(Func->getGlobalPool()); 6245 GlobalString MaskName = lowerShuffleVector_NewMaskName(); 6246 Mask->setIsConstant(true); 6247 Mask->addInitializer(VariableDeclaration::DataInitializer::create( 6248 Func->getGlobalPool(), Initializer, NumElements)); 6249 Mask->setName(MaskName); 6250 // Mask needs to be 16-byte aligned, or pshufb will seg fault. 6251 Mask->setAlignment(MaskAlignment); 6252 Func->addGlobal(Mask); 6253 6254 constexpr RelocOffsetT Offset = 0; 6255 return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName)); 6256 } 6257 6258 template <typename TraitsType> 6259 void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb( 6260 Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1, 6261 int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6, 6262 int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11, 6263 int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) { 6264 const Type DestTy = Dest->getType(); 6265 static constexpr bool NotRebased = false; 6266 static constexpr Variable *NoBase = nullptr; 6267 // We use void for the memory operand instead of DestTy because using the 6268 // latter causes a validation failure: the X86 Inst layer complains that 6269 // vector mem operands could be under aligned. Thus, using void we avoid the 6270 // validation error. Note that the mask global declaration is aligned, so it 6271 // can be used as an XMM mem operand. 6272 static constexpr Type MaskType = IceType_void; 6273 #define IDX_IN_SRC(N, S) \ 6274 ((((N) & (1 << 4)) == (S << 4)) ? 
((N)&0xf) : CLEAR_ALL_BITS) 6275 auto *Mask0M = X86OperandMem::create( 6276 Func, MaskType, NoBase, 6277 lowerShuffleVector_CreatePshufbMask( 6278 IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0), 6279 IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0), 6280 IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0), 6281 IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0), 6282 IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0), 6283 IDX_IN_SRC(Idx15, 0)), 6284 NotRebased); 6285 6286 auto *T0 = makeReg(DestTy); 6287 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6288 _movp(T0, Src0RM); 6289 6290 _pshufb(T0, Mask0M); 6291 6292 if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 || 6293 Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 || 6294 Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 || 6295 Idx15 >= 16) { 6296 auto *Mask1M = X86OperandMem::create( 6297 Func, MaskType, NoBase, 6298 lowerShuffleVector_CreatePshufbMask( 6299 IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1), 6300 IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1), 6301 IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1), 6302 IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1), 6303 IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1), 6304 IDX_IN_SRC(Idx15, 1)), 6305 NotRebased); 6306 #undef IDX_IN_SRC 6307 auto *T1 = makeReg(DestTy); 6308 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6309 _movp(T1, Src1RM); 6310 _pshufb(T1, Mask1M); 6311 _por(T0, T1); 6312 } 6313 6314 _movp(Dest, T0); 6315 } 6316 6317 template <typename TraitsType> 6318 void TargetX86Base<TraitsType>::lowerShuffleVector( 6319 const InstShuffleVector *Instr) { 6320 auto *Dest = Instr->getDest(); 6321 const Type DestTy = Dest->getType(); 6322 auto *Src0 = Instr->getSrc(0); 6323 auto *Src1 = Instr->getSrc(1); 6324 const SizeT NumElements = typeNumElements(DestTy); 6325 6326 auto *T = makeReg(DestTy); 6327 6328 switch (DestTy) { 6329 default: 6330 llvm::report_fatal_error("Unexpected vector type."); 6331 case IceType_v16i1: 6332 case IceType_v16i8: { 6333 static constexpr SizeT ExpectedNumElements = 16; 6334 assert(ExpectedNumElements == Instr->getNumIndexes()); 6335 (void)ExpectedNumElements; 6336 6337 if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) { 6338 auto *T = makeReg(DestTy); 6339 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6340 _movp(T, Src0RM); 6341 _punpckl(T, Src0RM); 6342 _movp(Dest, T); 6343 return; 6344 } 6345 6346 if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 6347 23)) { 6348 auto *T = makeReg(DestTy); 6349 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6350 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6351 _movp(T, Src0RM); 6352 _punpckl(T, Src1RM); 6353 _movp(Dest, T); 6354 return; 6355 } 6356 6357 if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 6358 15, 15)) { 6359 auto *T = makeReg(DestTy); 6360 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6361 _movp(T, Src0RM); 6362 _punpckh(T, Src0RM); 6363 _movp(Dest, T); 6364 return; 6365 } 6366 6367 if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 6368 15, 31)) { 6369 auto *T = makeReg(DestTy); 6370 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6371 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6372 _movp(T, Src0RM); 6373 _punpckh(T, Src1RM); 6374 _movp(Dest, T); 
6375 return; 6376 } 6377 6378 if (InstructionSet < Traits::SSE4_1) { 6379 // TODO(jpp): figure out how to lower with sse2. 6380 break; 6381 } 6382 6383 const SizeT Index0 = Instr->getIndexValue(0); 6384 const SizeT Index1 = Instr->getIndexValue(1); 6385 const SizeT Index2 = Instr->getIndexValue(2); 6386 const SizeT Index3 = Instr->getIndexValue(3); 6387 const SizeT Index4 = Instr->getIndexValue(4); 6388 const SizeT Index5 = Instr->getIndexValue(5); 6389 const SizeT Index6 = Instr->getIndexValue(6); 6390 const SizeT Index7 = Instr->getIndexValue(7); 6391 const SizeT Index8 = Instr->getIndexValue(8); 6392 const SizeT Index9 = Instr->getIndexValue(9); 6393 const SizeT Index10 = Instr->getIndexValue(10); 6394 const SizeT Index11 = Instr->getIndexValue(11); 6395 const SizeT Index12 = Instr->getIndexValue(12); 6396 const SizeT Index13 = Instr->getIndexValue(13); 6397 const SizeT Index14 = Instr->getIndexValue(14); 6398 const SizeT Index15 = Instr->getIndexValue(15); 6399 6400 lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2, 6401 Index3, Index4, Index5, Index6, Index7, 6402 Index8, Index9, Index10, Index11, Index12, 6403 Index13, Index14, Index15); 6404 return; 6405 } 6406 case IceType_v8i1: 6407 case IceType_v8i16: { 6408 static constexpr SizeT ExpectedNumElements = 8; 6409 assert(ExpectedNumElements == Instr->getNumIndexes()); 6410 (void)ExpectedNumElements; 6411 6412 if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) { 6413 auto *T = makeReg(DestTy); 6414 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6415 _movp(T, Src0RM); 6416 _punpckl(T, Src0RM); 6417 _movp(Dest, T); 6418 return; 6419 } 6420 6421 if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) { 6422 auto *T = makeReg(DestTy); 6423 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6424 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6425 _movp(T, Src0RM); 6426 _punpckl(T, Src1RM); 6427 _movp(Dest, T); 6428 return; 6429 } 6430 6431 if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) { 6432 auto *T = makeReg(DestTy); 6433 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6434 _movp(T, Src0RM); 6435 _punpckh(T, Src0RM); 6436 _movp(Dest, T); 6437 return; 6438 } 6439 6440 if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) { 6441 auto *T = makeReg(DestTy); 6442 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); 6443 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6444 _movp(T, Src0RM); 6445 _punpckh(T, Src1RM); 6446 _movp(Dest, T); 6447 return; 6448 } 6449 6450 if (InstructionSet < Traits::SSE4_1) { 6451 // TODO(jpp): figure out how to lower with sse2. 
6452 break; 6453 } 6454 6455 const SizeT Index0 = Instr->getIndexValue(0); 6456 const SizeT Index1 = Instr->getIndexValue(1); 6457 const SizeT Index2 = Instr->getIndexValue(2); 6458 const SizeT Index3 = Instr->getIndexValue(3); 6459 const SizeT Index4 = Instr->getIndexValue(4); 6460 const SizeT Index5 = Instr->getIndexValue(5); 6461 const SizeT Index6 = Instr->getIndexValue(6); 6462 const SizeT Index7 = Instr->getIndexValue(7); 6463 6464 #define TO_BYTE_INDEX(I) ((I) << 1) 6465 lowerShuffleVector_UsingPshufb( 6466 Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1, 6467 TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2), 6468 TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3), 6469 TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4), 6470 TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5), 6471 TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6), 6472 TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7), 6473 TO_BYTE_INDEX(Index7) + 1); 6474 #undef TO_BYTE_INDEX 6475 return; 6476 } 6477 case IceType_v4i1: 6478 case IceType_v4i32: 6479 case IceType_v4f32: { 6480 static constexpr SizeT ExpectedNumElements = 4; 6481 assert(ExpectedNumElements == Instr->getNumIndexes()); 6482 const SizeT Index0 = Instr->getIndexValue(0); 6483 const SizeT Index1 = Instr->getIndexValue(1); 6484 const SizeT Index2 = Instr->getIndexValue(2); 6485 const SizeT Index3 = Instr->getIndexValue(3); 6486 Variable *T = nullptr; 6487 switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) { 6488 #define CASE_SRCS_IN(S0, S1, S2, S3) \ 6489 case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3)) 6490 CASE_SRCS_IN(0, 0, 0, 0) : { 6491 T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2, 6492 Index3); 6493 } 6494 break; 6495 CASE_SRCS_IN(0, 0, 0, 1) : { 6496 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2, 6497 Src1, Index3); 6498 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified, 6499 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6500 } 6501 break; 6502 CASE_SRCS_IN(0, 0, 1, 0) : { 6503 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2, 6504 Src0, Index3); 6505 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified, 6506 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6507 } 6508 break; 6509 CASE_SRCS_IN(0, 0, 1, 1) : { 6510 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1, 6511 Index2, Index3); 6512 } 6513 break; 6514 CASE_SRCS_IN(0, 1, 0, 0) : { 6515 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0, 6516 Src1, Index1); 6517 T = lowerShuffleVector_TwoFromSameSrc( 6518 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3); 6519 } 6520 break; 6521 CASE_SRCS_IN(0, 1, 0, 1) : { 6522 if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 && 6523 (Index3 - ExpectedNumElements) == 1) { 6524 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); 6525 auto *Src0R = legalizeToReg(Src0); 6526 T = makeReg(DestTy); 6527 _movp(T, Src0R); 6528 _punpckl(T, Src1RM); 6529 } else if (Index0 == Index2 && Index1 == Index3) { 6530 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs( 6531 Src0, Index0, Src1, Index1); 6532 T = lowerShuffleVector_AllFromSameSrc( 6533 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0, 6534 UNIFIED_INDEX_1); 6535 } else { 6536 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs( 6537 Src0, Index0, Src1, Index1); 6538 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs( 6539 Src0, Index2, Src1, Index3); 6540 T = 
lowerShuffleVector_TwoFromSameSrc( 6541 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1, 6542 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6543 } 6544 } 6545 break; 6546 CASE_SRCS_IN(0, 1, 1, 0) : { 6547 if (Index0 == Index3 && Index1 == Index2) { 6548 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs( 6549 Src0, Index0, Src1, Index1); 6550 T = lowerShuffleVector_AllFromSameSrc( 6551 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1, 6552 UNIFIED_INDEX_0); 6553 } else { 6554 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs( 6555 Src0, Index0, Src1, Index1); 6556 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs( 6557 Src1, Index2, Src0, Index3); 6558 T = lowerShuffleVector_TwoFromSameSrc( 6559 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1, 6560 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6561 } 6562 } 6563 break; 6564 CASE_SRCS_IN(0, 1, 1, 1) : { 6565 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0, 6566 Src1, Index1); 6567 T = lowerShuffleVector_TwoFromSameSrc( 6568 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3); 6569 } 6570 break; 6571 CASE_SRCS_IN(1, 0, 0, 0) : { 6572 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0, 6573 Src0, Index1); 6574 T = lowerShuffleVector_TwoFromSameSrc( 6575 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3); 6576 } 6577 break; 6578 CASE_SRCS_IN(1, 0, 0, 1) : { 6579 if (Index0 == Index3 && Index1 == Index2) { 6580 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs( 6581 Src1, Index0, Src0, Index1); 6582 T = lowerShuffleVector_AllFromSameSrc( 6583 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1, 6584 UNIFIED_INDEX_0); 6585 } else { 6586 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs( 6587 Src1, Index0, Src0, Index1); 6588 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs( 6589 Src0, Index2, Src1, Index3); 6590 T = lowerShuffleVector_TwoFromSameSrc( 6591 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1, 6592 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6593 } 6594 } 6595 break; 6596 CASE_SRCS_IN(1, 0, 1, 0) : { 6597 if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 && 6598 (Index2 - ExpectedNumElements) == 1 && Index3 == 1) { 6599 auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem); 6600 auto *Src0R = legalizeToReg(Src1); 6601 T = makeReg(DestTy); 6602 _movp(T, Src0R); 6603 _punpckl(T, Src1RM); 6604 } else if (Index0 == Index2 && Index1 == Index3) { 6605 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs( 6606 Src1, Index0, Src0, Index1); 6607 T = lowerShuffleVector_AllFromSameSrc( 6608 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0, 6609 UNIFIED_INDEX_1); 6610 } else { 6611 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs( 6612 Src1, Index0, Src0, Index1); 6613 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs( 6614 Src1, Index2, Src0, Index3); 6615 T = lowerShuffleVector_TwoFromSameSrc( 6616 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1, 6617 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6618 } 6619 } 6620 break; 6621 CASE_SRCS_IN(1, 0, 1, 1) : { 6622 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0, 6623 Src0, Index1); 6624 T = lowerShuffleVector_TwoFromSameSrc( 6625 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3); 6626 } 6627 break; 6628 CASE_SRCS_IN(1, 1, 0, 0) : { 6629 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0, 6630 Index2, Index3); 6631 } 6632 break; 6633 CASE_SRCS_IN(1, 1, 0, 1) : { 6634 auto *Unified = 
lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2, 6635 Src1, Index3); 6636 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified, 6637 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6638 } 6639 break; 6640 CASE_SRCS_IN(1, 1, 1, 0) : { 6641 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2, 6642 Src0, Index3); 6643 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified, 6644 UNIFIED_INDEX_0, UNIFIED_INDEX_1); 6645 } 6646 break; 6647 CASE_SRCS_IN(1, 1, 1, 1) : { 6648 T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2, 6649 Index3); 6650 } 6651 break; 6652 #undef CASE_SRCS_IN 6653 } 6654 6655 assert(T != nullptr); 6656 assert(T->getType() == DestTy); 6657 _movp(Dest, T); 6658 return; 6659 } break; 6660 } 6661 6662 // Unoptimized shuffle. Perform a series of inserts and extracts. 6663 Context.insert<InstFakeDef>(T); 6664 const Type ElementType = typeElementType(DestTy); 6665 for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) { 6666 auto *Index = Instr->getIndex(I); 6667 const SizeT Elem = Index->getValue(); 6668 auto *ExtElmt = makeReg(ElementType); 6669 if (Elem < NumElements) { 6670 lowerExtractElement( 6671 InstExtractElement::create(Func, ExtElmt, Src0, Index)); 6672 } else { 6673 lowerExtractElement(InstExtractElement::create( 6674 Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements))); 6675 } 6676 auto *NewT = makeReg(DestTy); 6677 lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt, 6678 Ctx->getConstantInt32(I))); 6679 T = NewT; 6680 } 6681 _movp(Dest, T); 6682 } 6683 6684 template <typename TraitsType> 6685 void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) { 6686 Variable *Dest = Select->getDest(); 6687 6688 Operand *Condition = Select->getCondition(); 6689 // Handle folding opportunities. 6690 if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) { 6691 assert(Producer->isDeleted()); 6692 switch (BoolFolding<Traits>::getProducerKind(Producer)) { 6693 default: 6694 break; 6695 case BoolFolding<Traits>::PK_Icmp32: 6696 case BoolFolding<Traits>::PK_Icmp64: { 6697 lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select); 6698 return; 6699 } 6700 case BoolFolding<Traits>::PK_Fcmp: { 6701 lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select); 6702 return; 6703 } 6704 } 6705 } 6706 6707 if (isVectorType(Dest->getType())) { 6708 lowerSelectVector(Select); 6709 return; 6710 } 6711 6712 Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem); 6713 Operand *Zero = Ctx->getConstantZero(IceType_i32); 6714 _cmp(CmpResult, Zero); 6715 Operand *SrcT = Select->getTrueOperand(); 6716 Operand *SrcF = Select->getFalseOperand(); 6717 const BrCond Cond = Traits::Cond::Br_ne; 6718 lowerSelectMove(Dest, Cond, SrcT, SrcF); 6719 } 6720 6721 template <typename TraitsType> 6722 void TargetX86Base<TraitsType>::lowerSelectMove(Variable *Dest, BrCond Cond, 6723 Operand *SrcT, Operand *SrcF) { 6724 Type DestTy = Dest->getType(); 6725 if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) { 6726 // The cmov instruction doesn't allow 8-bit or FP operands, so we need 6727 // explicit control flow. 
6728 // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1: 6729 auto *Label = InstX86Label::create(Func, this); 6730 SrcT = legalize(SrcT, Legal_Reg | Legal_Imm); 6731 _mov(Dest, SrcT); 6732 _br(Cond, Label); 6733 SrcF = legalize(SrcF, Legal_Reg | Legal_Imm); 6734 _redefined(_mov(Dest, SrcF)); 6735 Context.insert(Label); 6736 return; 6737 } 6738 // mov t, SrcF; cmov_cond t, SrcT; mov dest, t 6739 // But if SrcT is immediate, we might be able to do better, as the cmov 6740 // instruction doesn't allow an immediate operand: 6741 // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t 6742 if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) { 6743 std::swap(SrcT, SrcF); 6744 Cond = InstImpl<TraitsType>::InstX86Base::getOppositeCondition(Cond); 6745 } 6746 if (!Traits::Is64Bit && DestTy == IceType_i64) { 6747 SrcT = legalizeUndef(SrcT); 6748 SrcF = legalizeUndef(SrcF); 6749 // Set the low portion. 6750 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 6751 lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF)); 6752 // Set the high portion. 6753 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 6754 lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF)); 6755 return; 6756 } 6757 6758 assert(DestTy == IceType_i16 || DestTy == IceType_i32 || 6759 (Traits::Is64Bit && DestTy == IceType_i64)); 6760 lowerSelectIntMove(Dest, Cond, SrcT, SrcF); 6761 } 6762 6763 template <typename TraitsType> 6764 void TargetX86Base<TraitsType>::lowerSelectIntMove(Variable *Dest, BrCond Cond, 6765 Operand *SrcT, 6766 Operand *SrcF) { 6767 Variable *T = nullptr; 6768 SrcF = legalize(SrcF); 6769 _mov(T, SrcF); 6770 SrcT = legalize(SrcT, Legal_Reg | Legal_Mem); 6771 _cmov(T, SrcT, Cond); 6772 _mov(Dest, T); 6773 } 6774 6775 template <typename TraitsType> 6776 void TargetX86Base<TraitsType>::lowerMove(Variable *Dest, Operand *Src, 6777 bool IsRedefinition) { 6778 assert(Dest->getType() == Src->getType()); 6779 assert(!Dest->isRematerializable()); 6780 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { 6781 Src = legalize(Src); 6782 Operand *SrcLo = loOperand(Src); 6783 Operand *SrcHi = hiOperand(Src); 6784 auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); 6785 auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 6786 Variable *T_Lo = nullptr, *T_Hi = nullptr; 6787 _mov(T_Lo, SrcLo); 6788 _redefined(_mov(DestLo, T_Lo), IsRedefinition); 6789 _mov(T_Hi, SrcHi); 6790 _redefined(_mov(DestHi, T_Hi), IsRedefinition); 6791 } else { 6792 Operand *SrcLegal; 6793 if (Dest->hasReg()) { 6794 // If Dest already has a physical register, then only basic legalization 6795 // is needed, as the source operand can be a register, immediate, or 6796 // memory. 6797 SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum()); 6798 } else { 6799 // If Dest could be a stack operand, then RI must be a physical register 6800 // or a scalar integer immediate. 
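// (x86 has no memory-to-memory mov encoding, so when Dest may end up in a
// stack slot the source must already be a register or an immediate.)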
6801 SrcLegal = legalize(Src, Legal_Reg | Legal_Imm); 6802 } 6803 if (isVectorType(Dest->getType())) { 6804 _redefined(_movp(Dest, SrcLegal), IsRedefinition); 6805 } else { 6806 _redefined(_mov(Dest, SrcLegal), IsRedefinition); 6807 } 6808 } 6809 } 6810 6811 template <typename TraitsType> 6812 bool TargetX86Base<TraitsType>::lowerOptimizeFcmpSelect( 6813 const InstFcmp *Fcmp, const InstSelect *Select) { 6814 Operand *CmpSrc0 = Fcmp->getSrc(0); 6815 Operand *CmpSrc1 = Fcmp->getSrc(1); 6816 Operand *SelectSrcT = Select->getTrueOperand(); 6817 Operand *SelectSrcF = Select->getFalseOperand(); 6818 Variable *SelectDest = Select->getDest(); 6819 6820 // TODO(capn): also handle swapped compare/select operand order. 6821 if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF) 6822 return false; 6823 6824 // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here. 6825 InstFcmp::FCond Condition = Fcmp->getCondition(); 6826 switch (Condition) { 6827 default: 6828 return false; 6829 case InstFcmp::True: 6830 break; 6831 case InstFcmp::False: 6832 break; 6833 case InstFcmp::Ogt: { 6834 Variable *T = makeReg(SelectDest->getType()); 6835 if (isScalarFloatingType(SelectSrcT->getType())) { 6836 _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem)); 6837 _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem)); 6838 _mov(SelectDest, T); 6839 } else { 6840 _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem)); 6841 _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem)); 6842 _movp(SelectDest, T); 6843 } 6844 return true; 6845 } break; 6846 case InstFcmp::Olt: { 6847 Variable *T = makeReg(SelectSrcT->getType()); 6848 if (isScalarFloatingType(SelectSrcT->getType())) { 6849 _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem)); 6850 _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem)); 6851 _mov(SelectDest, T); 6852 } else { 6853 _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem)); 6854 _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem)); 6855 _movp(SelectDest, T); 6856 } 6857 return true; 6858 } break; 6859 } 6860 return false; 6861 } 6862 6863 template <typename TraitsType> 6864 void TargetX86Base<TraitsType>::lowerIcmp(const InstIcmp *Icmp) { 6865 Variable *Dest = Icmp->getDest(); 6866 if (isVectorType(Dest->getType())) { 6867 lowerIcmpVector(Icmp); 6868 } else { 6869 constexpr Inst *Consumer = nullptr; 6870 lowerIcmpAndConsumer(Icmp, Consumer); 6871 } 6872 } 6873 6874 template <typename TraitsType> 6875 void TargetX86Base<TraitsType>::lowerSelectVector(const InstSelect *Instr) { 6876 Variable *Dest = Instr->getDest(); 6877 Type DestTy = Dest->getType(); 6878 Operand *SrcT = Instr->getTrueOperand(); 6879 Operand *SrcF = Instr->getFalseOperand(); 6880 Operand *Condition = Instr->getCondition(); 6881 6882 if (!isVectorType(DestTy)) 6883 llvm::report_fatal_error("Expected a vector select"); 6884 6885 Type SrcTy = SrcT->getType(); 6886 Variable *T = makeReg(SrcTy); 6887 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem); 6888 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem); 6889 6890 if (InstructionSet >= Traits::SSE4_1) { 6891 // TODO(wala): If the condition operand is a constant, use blendps or 6892 // pblendw. 6893 // 6894 // Use blendvps or pblendvb to implement select. 
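// Both instructions select each destination lane from their second source when
// the most significant bit of the corresponding lane of xmm0 is set; that is
// why the i1 condition is shifted into the sign bit (v4 case) or sign-extended
// to full-width lanes (v8/v16 case) below.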
6895 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || 6896 SrcTy == IceType_v4f32) { 6897 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); 6898 Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0); 6899 _movp(xmm0, ConditionRM); 6900 _psll(xmm0, Ctx->getConstantInt8(31)); 6901 _movp(T, SrcFRM); 6902 _blendvps(T, SrcTRM, xmm0); 6903 _movp(Dest, T); 6904 } else { 6905 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16); 6906 Type SignExtTy = 6907 Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8; 6908 Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0); 6909 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition)); 6910 _movp(T, SrcFRM); 6911 _pblendvb(T, SrcTRM, xmm0); 6912 _movp(Dest, T); 6913 } 6914 return; 6915 } 6916 // Lower select without Traits::SSE4.1: 6917 // a=d?b:c ==> 6918 // if elementtype(d) != i1: 6919 // d=sext(d); 6920 // a=(b&d)|(c&~d); 6921 Variable *T2 = makeReg(SrcTy); 6922 // Sign extend the condition operand if applicable. 6923 if (SrcTy == IceType_v4f32) { 6924 // The sext operation takes only integer arguments. 6925 Variable *T3 = Func->makeVariable(IceType_v4i32); 6926 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); 6927 _movp(T, T3); 6928 } else if (typeElementType(SrcTy) != IceType_i1) { 6929 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); 6930 } else { 6931 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); 6932 _movp(T, ConditionRM); 6933 } 6934 _movp(T2, T); 6935 _pand(T, SrcTRM); 6936 _pandn(T2, SrcFRM); 6937 _por(T, T2); 6938 _movp(Dest, T); 6939 6940 return; 6941 } 6942 6943 template <typename TraitsType> 6944 void TargetX86Base<TraitsType>::lowerStore(const InstStore *Instr) { 6945 Operand *Value = Instr->getData(); 6946 Operand *Addr = Instr->getAddr(); 6947 X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType()); 6948 doMockBoundsCheck(NewAddr); 6949 Type Ty = NewAddr->getType(); 6950 6951 if (!Traits::Is64Bit && Ty == IceType_i64) { 6952 Value = legalizeUndef(Value); 6953 Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm); 6954 _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr))); 6955 Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm); 6956 _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr))); 6957 } else if (isVectorType(Ty)) { 6958 _storep(legalizeToReg(Value), NewAddr); 6959 } else { 6960 Value = legalize(Value, Legal_Reg | Legal_Imm); 6961 _store(Value, NewAddr); 6962 } 6963 } 6964 6965 template <typename TraitsType> 6966 void TargetX86Base<TraitsType>::doAddressOptStore() { 6967 auto *Instr = llvm::cast<InstStore>(Context.getCur()); 6968 Operand *Addr = Instr->getAddr(); 6969 Operand *Data = Instr->getData(); 6970 if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) { 6971 Instr->setDeleted(); 6972 auto *NewStore = Context.insert<InstStore>(Data, OptAddr); 6973 if (Instr->getDest()) 6974 NewStore->setRmwBeacon(Instr->getRmwBeacon()); 6975 } 6976 } 6977 6978 template <typename TraitsType> 6979 void TargetX86Base<TraitsType>::doAddressOptStoreSubVector() { 6980 auto *Intrinsic = llvm::cast<InstIntrinsicCall>(Context.getCur()); 6981 Operand *Addr = Intrinsic->getArg(1); 6982 Operand *Data = Intrinsic->getArg(0); 6983 if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) { 6984 Intrinsic->setDeleted(); 6985 const Ice::Intrinsics::IntrinsicInfo Info = { 6986 Ice::Intrinsics::StoreSubVector, 
Ice::Intrinsics::SideEffects_T, 6987 Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T}; 6988 auto Target = Ctx->getConstantUndef(Ice::IceType_i32); 6989 auto *NewStore = 6990 Context.insert<InstIntrinsicCall>(3, nullptr, Target, Info); 6991 NewStore->addArg(Data); 6992 NewStore->addArg(OptAddr); 6993 NewStore->addArg(Intrinsic->getArg(2)); 6994 } 6995 } 6996 6997 template <typename TraitsType> 6998 Operand *TargetX86Base<TraitsType>::lowerCmpRange(Operand *Comparison, 6999 uint64_t Min, uint64_t Max) { 7000 // TODO(ascull): 64-bit should not reach here but only because it is not 7001 // implemented yet. This should be able to handle the 64-bit case. 7002 assert(Traits::Is64Bit || Comparison->getType() != IceType_i64); 7003 // Subtracting 0 is a nop so don't do it 7004 if (Min != 0) { 7005 // Avoid clobbering the comparison by copying it 7006 Variable *T = nullptr; 7007 _mov(T, Comparison); 7008 _sub(T, Ctx->getConstantInt32(Min)); 7009 Comparison = T; 7010 } 7011 7012 _cmp(Comparison, Ctx->getConstantInt32(Max - Min)); 7013 7014 return Comparison; 7015 } 7016 7017 template <typename TraitsType> 7018 void TargetX86Base<TraitsType>::lowerCaseCluster(const CaseCluster &Case, 7019 Operand *Comparison, 7020 bool DoneCmp, 7021 CfgNode *DefaultTarget) { 7022 switch (Case.getKind()) { 7023 case CaseCluster::JumpTable: { 7024 InstX86Label *SkipJumpTable; 7025 7026 Operand *RangeIndex = 7027 lowerCmpRange(Comparison, Case.getLow(), Case.getHigh()); 7028 if (DefaultTarget == nullptr) { 7029 // Skip over jump table logic if comparison not in range and no default 7030 SkipJumpTable = InstX86Label::create(Func, this); 7031 _br(Traits::Cond::Br_a, SkipJumpTable); 7032 } else { 7033 _br(Traits::Cond::Br_a, DefaultTarget); 7034 } 7035 7036 InstJumpTable *JumpTable = Case.getJumpTable(); 7037 Context.insert(JumpTable); 7038 7039 // Make sure the index is a register of the same width as the base 7040 Variable *Index; 7041 const Type PointerType = getPointerType(); 7042 if (RangeIndex->getType() != PointerType) { 7043 Index = makeReg(PointerType); 7044 if (RangeIndex->getType() == IceType_i64) { 7045 assert(Traits::Is64Bit); 7046 _mov(Index, RangeIndex); // trunc 7047 } else { 7048 Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem); 7049 _movzx(Index, RangeIndexRM); 7050 } 7051 } else { 7052 Index = legalizeToReg(RangeIndex); 7053 } 7054 7055 constexpr RelocOffsetT RelocOffset = 0; 7056 constexpr Variable *NoBase = nullptr; 7057 constexpr Constant *NoOffset = nullptr; 7058 auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName()); 7059 Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName); 7060 uint16_t Shift = typeWidthInBytesLog2(PointerType); 7061 constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment; 7062 7063 Variable *Target = nullptr; 7064 if (Traits::Is64Bit && NeedSandboxing) { 7065 assert(Index != nullptr && Index->getType() == IceType_i32); 7066 } 7067 7068 if (PointerType == IceType_i32) { 7069 _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset, 7070 Index, Shift, Segment)); 7071 } else { 7072 auto *Base = makeReg(IceType_i64); 7073 _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset)); 7074 _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset, 7075 Index, Shift, Segment)); 7076 } 7077 7078 lowerIndirectJump(Target); 7079 7080 if (DefaultTarget == nullptr) 7081 Context.insert(SkipJumpTable); 7082 return; 7083 } 7084 case CaseCluster::Range: { 7085 if (Case.isUnitRange()) { 7086 
// Single item
7087 if (!DoneCmp) {
7088 Constant *Value = Ctx->getConstantInt32(Case.getLow());
7089 _cmp(Comparison, Value);
7090 }
7091 _br(Traits::Cond::Br_e, Case.getTarget());
7092 } else if (DoneCmp && Case.isPairRange()) {
7093 // Range of two items with first item already compared against
7094 _br(Traits::Cond::Br_e, Case.getTarget());
7095 Constant *Value = Ctx->getConstantInt32(Case.getHigh());
7096 _cmp(Comparison, Value);
7097 _br(Traits::Cond::Br_e, Case.getTarget());
7098 } else {
7099 // Range
7100 lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
7101 _br(Traits::Cond::Br_be, Case.getTarget());
7102 }
7103 if (DefaultTarget != nullptr)
7104 _br(DefaultTarget);
7105 return;
7106 }
7107 }
7108 }
7109
7110 template <typename TraitsType>
7111 void TargetX86Base<TraitsType>::lowerSwitch(const InstSwitch *Instr) {
7112 // Group cases together and navigate through them with a binary search
7113 CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
7114 Operand *Src0 = Instr->getComparison();
7115 CfgNode *DefaultTarget = Instr->getLabelDefault();
7116
7117 assert(CaseClusters.size() != 0); // Should always be at least one
7118
7119 if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
7120 Src0 = legalize(Src0); // get Base/Index into physical registers
7121 Operand *Src0Lo = loOperand(Src0);
7122 Operand *Src0Hi = hiOperand(Src0);
7123 if (CaseClusters.back().getHigh() > UINT32_MAX) {
7124 // TODO(ascull): handle 64-bit case properly (currently naive version)
7125 // This might be handled by a higher level lowering of switches.
7126 SizeT NumCases = Instr->getNumCases();
7127 if (NumCases >= 2) {
7128 Src0Lo = legalizeToReg(Src0Lo);
7129 Src0Hi = legalizeToReg(Src0Hi);
7130 } else {
7131 Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
7132 Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
7133 }
7134 for (SizeT I = 0; I < NumCases; ++I) {
7135 Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
7136 Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
7137 InstX86Label *Label = InstX86Label::create(Func, this);
7138 _cmp(Src0Lo, ValueLo);
7139 _br(Traits::Cond::Br_ne, Label);
7140 _cmp(Src0Hi, ValueHi);
7141 _br(Traits::Cond::Br_e, Instr->getLabel(I));
7142 Context.insert(Label);
7143 }
7144 _br(Instr->getLabelDefault());
7145 return;
7146 } else {
7147 // All the values are 32-bit so just check the operand is too and then
7148 // fall through to the 32-bit implementation. This is a common case.
7149 Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
7150 Constant *Zero = Ctx->getConstantInt32(0);
7151 _cmp(Src0Hi, Zero);
7152 _br(Traits::Cond::Br_ne, DefaultTarget);
7153 Src0 = Src0Lo;
7154 }
7155 }
7156
7157 // 32-bit lowering
7158
7159 if (CaseClusters.size() == 1) {
7160 // Jump straight to default if needed. Currently a common case as jump
7161 // tables occur on their own.
7162 constexpr bool DoneCmp = false;
7163 lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
7164 return;
7165 }
7166
7167 // Going to be using multiple times so get it in a register early
7168 Variable *Comparison = legalizeToReg(Src0);
7169
7170 // A span covers a contiguous subrange of the clusters
7171 struct SearchSpan {
7172 SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
7173 : Begin(Begin), Size(Size), Label(Label) {}
7174
7175 SizeT Begin;
7176 SizeT Size;
7177 InstX86Label *Label;
7178 };
7179 // The stack will only grow to the height of the tree so 12 should be plenty
7180 std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
7181 SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
7182 bool DoneCmp = false;
7183
7184 while (!SearchSpanStack.empty()) {
7185 SearchSpan Span = SearchSpanStack.top();
7186 SearchSpanStack.pop();
7187
7188 if (Span.Label != nullptr)
7189 Context.insert(Span.Label);
7190
7191 switch (Span.Size) {
7192 case 0:
7193 llvm::report_fatal_error("Invalid SearchSpan size");
7194 break;
7195
7196 case 1:
7197 lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
7198 SearchSpanStack.empty() ? nullptr : DefaultTarget);
7199 DoneCmp = false;
7200 break;
7201
7202 case 2: {
7203 const CaseCluster *CaseA = &CaseClusters[Span.Begin];
7204 const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
7205
7206 // Placing a range last may allow register clobbering during the range
7207 // test. That means there is no need to clone the register. If it is a
7208 // unit range the comparison may have already been done in the binary
7209 // search (DoneCmp) and so it should be placed first. If this is a range
7210 // of two items and the comparison with the low value has already been
7211 // done, comparing with the other element is cheaper than a range test.
7212 // If the low end of the range is zero then there is no subtraction and
7213 // nothing to be gained.
7214 if (!CaseA->isUnitRange() &&
7215 !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
7216 std::swap(CaseA, CaseB);
7217 DoneCmp = false;
7218 }
7219
7220 lowerCaseCluster(*CaseA, Comparison, DoneCmp);
7221 DoneCmp = false;
7222 lowerCaseCluster(*CaseB, Comparison, DoneCmp,
7223 SearchSpanStack.empty() ? nullptr : DefaultTarget);
7224 } break;
7225
7226 default:
7227 // Pick the middle item and branch b or ae
7228 SizeT PivotIndex = Span.Begin + (Span.Size / 2);
7229 const CaseCluster &Pivot = CaseClusters[PivotIndex];
7230 Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
7231 InstX86Label *Label = InstX86Label::create(Func, this);
7232 _cmp(Comparison, Value);
7233 // TODO(ascull): does it always have to be far?
7234 _br(Traits::Cond::Br_b, Label, InstX86Br::Far);
7235 // Lower the left and (pivot+right) sides, falling through to the right
7236 SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
7237 SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
7238 DoneCmp = true;
7239 break;
7240 }
7241 }
7242
7243 _br(DefaultTarget);
7244 }
7245
7246 /// The following pattern occurs often in lowered C and C++ code:
7247 ///
7248 /// %cmp = fcmp/icmp pred <n x ty> %src0, %src1
7249 /// %cmp.ext = sext <n x i1> %cmp to <n x ty>
7250 ///
7251 /// We can eliminate the sext operation by copying the result of pcmpeqd,
7252 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of the
7253 /// sext operation.
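/// For example (illustrative only), once an icmp eq <4 x i32> has been lowered
/// with pcmpeqd, every lane of the result is already all-ones or all-zeros, so
/// the following sext <4 x i1> to <4 x i32> reduces to a plain vector move of
/// that same register.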
7254 template <typename TraitsType> 7255 void TargetX86Base<TraitsType>::eliminateNextVectorSextInstruction( 7256 Variable *SignExtendedResult) { 7257 if (auto *NextCast = 7258 llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) { 7259 if (NextCast->getCastKind() == InstCast::Sext && 7260 NextCast->getSrc(0) == SignExtendedResult) { 7261 NextCast->setDeleted(); 7262 _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult)); 7263 // Skip over the instruction. 7264 Context.advanceNext(); 7265 } 7266 } 7267 } 7268 7269 template <typename TraitsType> 7270 void TargetX86Base<TraitsType>::lowerUnreachable( 7271 const InstUnreachable * /*Instr*/) { 7272 _ud2(); 7273 // Add a fake use of esp to make sure esp adjustments after the unreachable 7274 // do not get dead-code eliminated. 7275 keepEspLiveAtExit(); 7276 } 7277 7278 template <typename TraitsType> 7279 void TargetX86Base<TraitsType>::lowerBreakpoint( 7280 const InstBreakpoint * /*Instr*/) { 7281 _int3(); 7282 } 7283 7284 template <typename TraitsType> 7285 void TargetX86Base<TraitsType>::lowerRMW(const InstX86FakeRMW *RMW) { 7286 // If the beacon variable's live range does not end in this instruction, then 7287 // it must end in the modified Store instruction that follows. This means 7288 // that the original Store instruction is still there, either because the 7289 // value being stored is used beyond the Store instruction, or because dead 7290 // code elimination did not happen. In either case, we cancel RMW lowering 7291 // (and the caller deletes the RMW instruction). 7292 if (!RMW->isLastUse(RMW->getBeacon())) 7293 return; 7294 Operand *Src = RMW->getData(); 7295 Type Ty = Src->getType(); 7296 X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty); 7297 doMockBoundsCheck(Addr); 7298 if (!Traits::Is64Bit && Ty == IceType_i64) { 7299 Src = legalizeUndef(Src); 7300 Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm); 7301 Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm); 7302 auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr)); 7303 auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr)); 7304 switch (RMW->getOp()) { 7305 default: 7306 // TODO(stichnot): Implement other arithmetic operators. 7307 break; 7308 case InstArithmetic::Add: 7309 _add_rmw(AddrLo, SrcLo); 7310 _adc_rmw(AddrHi, SrcHi); 7311 return; 7312 case InstArithmetic::Sub: 7313 _sub_rmw(AddrLo, SrcLo); 7314 _sbb_rmw(AddrHi, SrcHi); 7315 return; 7316 case InstArithmetic::And: 7317 _and_rmw(AddrLo, SrcLo); 7318 _and_rmw(AddrHi, SrcHi); 7319 return; 7320 case InstArithmetic::Or: 7321 _or_rmw(AddrLo, SrcLo); 7322 _or_rmw(AddrHi, SrcHi); 7323 return; 7324 case InstArithmetic::Xor: 7325 _xor_rmw(AddrLo, SrcLo); 7326 _xor_rmw(AddrHi, SrcHi); 7327 return; 7328 } 7329 } else { 7330 // x86-32: i8, i16, i32 7331 // x86-64: i8, i16, i32, i64 7332 switch (RMW->getOp()) { 7333 default: 7334 // TODO(stichnot): Implement other arithmetic operators. 
7335 break; 7336 case InstArithmetic::Add: 7337 Src = legalize(Src, Legal_Reg | Legal_Imm); 7338 _add_rmw(Addr, Src); 7339 return; 7340 case InstArithmetic::Sub: 7341 Src = legalize(Src, Legal_Reg | Legal_Imm); 7342 _sub_rmw(Addr, Src); 7343 return; 7344 case InstArithmetic::And: 7345 Src = legalize(Src, Legal_Reg | Legal_Imm); 7346 _and_rmw(Addr, Src); 7347 return; 7348 case InstArithmetic::Or: 7349 Src = legalize(Src, Legal_Reg | Legal_Imm); 7350 _or_rmw(Addr, Src); 7351 return; 7352 case InstArithmetic::Xor: 7353 Src = legalize(Src, Legal_Reg | Legal_Imm); 7354 _xor_rmw(Addr, Src); 7355 return; 7356 } 7357 } 7358 llvm::report_fatal_error("Couldn't lower RMW instruction"); 7359 } 7360 7361 template <typename TraitsType> 7362 void TargetX86Base<TraitsType>::lowerOther(const Inst *Instr) { 7363 if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) { 7364 lowerRMW(RMW); 7365 } else { 7366 TargetLowering::lowerOther(Instr); 7367 } 7368 } 7369 7370 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve 7371 /// integrity of liveness analysis. Undef values are also turned into zeroes, 7372 /// since loOperand() and hiOperand() don't expect Undef input. Also, in 7373 /// Non-SFI mode, add a FakeUse(RebasePtr) for every pooled constant operand. 7374 template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() { 7375 if (getFlags().getUseNonsfi()) { 7376 assert(RebasePtr); 7377 CfgNode *Node = Context.getNode(); 7378 uint32_t RebasePtrUseCount = 0; 7379 for (Inst &I : Node->getPhis()) { 7380 auto *Phi = llvm::dyn_cast<InstPhi>(&I); 7381 if (Phi->isDeleted()) 7382 continue; 7383 for (SizeT I = 0; I < Phi->getSrcSize(); ++I) { 7384 Operand *Src = Phi->getSrc(I); 7385 // TODO(stichnot): This over-counts for +0.0, and under-counts for other 7386 // kinds of pooling. 7387 if (llvm::isa<ConstantRelocatable>(Src) || 7388 llvm::isa<ConstantFloat>(Src) || llvm::isa<ConstantDouble>(Src)) { 7389 ++RebasePtrUseCount; 7390 } 7391 } 7392 } 7393 if (RebasePtrUseCount) { 7394 Node->getInsts().push_front(InstFakeUse::create(Func, RebasePtr)); 7395 } 7396 } 7397 if (Traits::Is64Bit) { 7398 // On x86-64 we don't need to prelower phis -- the architecture can handle 7399 // 64-bit integer natively. 
7400 return; 7401 } 7402 7403 // Pause constant blinding or pooling, blinding or pooling will be done later 7404 // during phi lowering assignments 7405 BoolFlagSaver B(RandomizationPoolingPaused, true); 7406 PhiLowering::prelowerPhis32Bit<TargetX86Base<TraitsType>>( 7407 this, Context.getNode(), Func); 7408 } 7409 7410 template <typename TraitsType> 7411 void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) { 7412 uint32_t StackArgumentsSize = 0; 7413 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) { 7414 RuntimeHelper HelperID = RuntimeHelper::H_Num; 7415 Variable *Dest = Arith->getDest(); 7416 Type DestTy = Dest->getType(); 7417 if (!Traits::Is64Bit && DestTy == IceType_i64) { 7418 switch (Arith->getOp()) { 7419 default: 7420 return; 7421 case InstArithmetic::Udiv: 7422 HelperID = RuntimeHelper::H_udiv_i64; 7423 break; 7424 case InstArithmetic::Sdiv: 7425 HelperID = RuntimeHelper::H_sdiv_i64; 7426 break; 7427 case InstArithmetic::Urem: 7428 HelperID = RuntimeHelper::H_urem_i64; 7429 break; 7430 case InstArithmetic::Srem: 7431 HelperID = RuntimeHelper::H_srem_i64; 7432 break; 7433 } 7434 } else if (isVectorType(DestTy)) { 7435 Variable *Dest = Arith->getDest(); 7436 Operand *Src0 = Arith->getSrc(0); 7437 Operand *Src1 = Arith->getSrc(1); 7438 switch (Arith->getOp()) { 7439 default: 7440 return; 7441 case InstArithmetic::Mul: 7442 if (DestTy == IceType_v16i8) { 7443 scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1); 7444 Arith->setDeleted(); 7445 } 7446 return; 7447 case InstArithmetic::Shl: 7448 case InstArithmetic::Lshr: 7449 case InstArithmetic::Ashr: 7450 if (llvm::isa<Constant>(Src1)) { 7451 return; 7452 } 7453 case InstArithmetic::Udiv: 7454 case InstArithmetic::Urem: 7455 case InstArithmetic::Sdiv: 7456 case InstArithmetic::Srem: 7457 case InstArithmetic::Frem: 7458 scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1); 7459 Arith->setDeleted(); 7460 return; 7461 } 7462 } else { 7463 switch (Arith->getOp()) { 7464 default: 7465 return; 7466 case InstArithmetic::Frem: 7467 if (isFloat32Asserting32Or64(DestTy)) 7468 HelperID = RuntimeHelper::H_frem_f32; 7469 else 7470 HelperID = RuntimeHelper::H_frem_f64; 7471 } 7472 } 7473 constexpr SizeT MaxSrcs = 2; 7474 InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs); 7475 Call->addArg(Arith->getSrc(0)); 7476 Call->addArg(Arith->getSrc(1)); 7477 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call); 7478 Context.insert(Call); 7479 Arith->setDeleted(); 7480 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) { 7481 InstCast::OpKind CastKind = Cast->getCastKind(); 7482 Operand *Src0 = Cast->getSrc(0); 7483 const Type SrcType = Src0->getType(); 7484 Variable *Dest = Cast->getDest(); 7485 const Type DestTy = Dest->getType(); 7486 RuntimeHelper HelperID = RuntimeHelper::H_Num; 7487 Variable *CallDest = Dest; 7488 switch (CastKind) { 7489 default: 7490 return; 7491 case InstCast::Fptosi: 7492 if (!Traits::Is64Bit && DestTy == IceType_i64) { 7493 HelperID = isFloat32Asserting32Or64(SrcType) 7494 ? RuntimeHelper::H_fptosi_f32_i64 7495 : RuntimeHelper::H_fptosi_f64_i64; 7496 } else { 7497 return; 7498 } 7499 break; 7500 case InstCast::Fptoui: 7501 if (isVectorType(DestTy)) { 7502 assert(DestTy == IceType_v4i32); 7503 assert(SrcType == IceType_v4f32); 7504 HelperID = RuntimeHelper::H_fptoui_4xi32_f32; 7505 } else if (DestTy == IceType_i64 || 7506 (!Traits::Is64Bit && DestTy == IceType_i32)) { 7507 if (Traits::Is64Bit) { 7508 HelperID = isFloat32Asserting32Or64(SrcType) 7509 ? 
RuntimeHelper::H_fptoui_f32_i64 7510 : RuntimeHelper::H_fptoui_f64_i64; 7511 } else if (isInt32Asserting32Or64(DestTy)) { 7512 HelperID = isFloat32Asserting32Or64(SrcType) 7513 ? RuntimeHelper::H_fptoui_f32_i32 7514 : RuntimeHelper::H_fptoui_f64_i32; 7515 } else { 7516 HelperID = isFloat32Asserting32Or64(SrcType) 7517 ? RuntimeHelper::H_fptoui_f32_i64 7518 : RuntimeHelper::H_fptoui_f64_i64; 7519 } 7520 } else { 7521 return; 7522 } 7523 break; 7524 case InstCast::Sitofp: 7525 if (!Traits::Is64Bit && SrcType == IceType_i64) { 7526 HelperID = isFloat32Asserting32Or64(DestTy) 7527 ? RuntimeHelper::H_sitofp_i64_f32 7528 : RuntimeHelper::H_sitofp_i64_f64; 7529 } else { 7530 return; 7531 } 7532 break; 7533 case InstCast::Uitofp: 7534 if (isVectorType(SrcType)) { 7535 assert(DestTy == IceType_v4f32); 7536 assert(SrcType == IceType_v4i32); 7537 HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32; 7538 } else if (SrcType == IceType_i64 || 7539 (!Traits::Is64Bit && SrcType == IceType_i32)) { 7540 if (isInt32Asserting32Or64(SrcType)) { 7541 HelperID = isFloat32Asserting32Or64(DestTy) 7542 ? RuntimeHelper::H_uitofp_i32_f32 7543 : RuntimeHelper::H_uitofp_i32_f64; 7544 } else { 7545 HelperID = isFloat32Asserting32Or64(DestTy) 7546 ? RuntimeHelper::H_uitofp_i64_f32 7547 : RuntimeHelper::H_uitofp_i64_f64; 7548 } 7549 } else { 7550 return; 7551 } 7552 break; 7553 case InstCast::Bitcast: { 7554 if (DestTy == Src0->getType()) 7555 return; 7556 switch (DestTy) { 7557 default: 7558 return; 7559 case IceType_i8: 7560 assert(Src0->getType() == IceType_v8i1); 7561 HelperID = RuntimeHelper::H_bitcast_8xi1_i8; 7562 CallDest = Func->makeVariable(IceType_i32); 7563 break; 7564 case IceType_i16: 7565 assert(Src0->getType() == IceType_v16i1); 7566 HelperID = RuntimeHelper::H_bitcast_16xi1_i16; 7567 CallDest = Func->makeVariable(IceType_i32); 7568 break; 7569 case IceType_v8i1: { 7570 assert(Src0->getType() == IceType_i8); 7571 HelperID = RuntimeHelper::H_bitcast_i8_8xi1; 7572 Variable *Src0AsI32 = Func->makeVariable(stackSlotType()); 7573 // Arguments to functions are required to be at least 32 bits wide. 7574 Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0); 7575 Src0 = Src0AsI32; 7576 } break; 7577 case IceType_v16i1: { 7578 assert(Src0->getType() == IceType_i16); 7579 HelperID = RuntimeHelper::H_bitcast_i16_16xi1; 7580 Variable *Src0AsI32 = Func->makeVariable(stackSlotType()); 7581 // Arguments to functions are required to be at least 32 bits wide. 7582 Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0); 7583 Src0 = Src0AsI32; 7584 } break; 7585 } 7586 } break; 7587 } 7588 constexpr SizeT MaxSrcs = 1; 7589 InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs); 7590 Call->addArg(Src0); 7591 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call); 7592 Context.insert(Call); 7593 // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call 7594 // result to the appropriate type as necessary. 
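// For example, the bitcast <8 x i1> -> i8 helper above returns its result in
// an i32 CallDest, and an i32 -> i8 trunc to the real Dest is inserted here.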
7595 if (CallDest->getType() != Dest->getType()) 7596 Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest); 7597 Cast->setDeleted(); 7598 } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsicCall>(Instr)) { 7599 CfgVector<Type> ArgTypes; 7600 Type ReturnType = IceType_void; 7601 switch (Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicInfo().ID) { 7602 default: 7603 return; 7604 case Intrinsics::Ctpop: { 7605 Operand *Val = Intrinsic->getArg(0); 7606 Type ValTy = Val->getType(); 7607 if (ValTy == IceType_i64) 7608 ArgTypes = {IceType_i64}; 7609 else 7610 ArgTypes = {IceType_i32}; 7611 ReturnType = IceType_i32; 7612 } break; 7613 case Intrinsics::Longjmp: 7614 ArgTypes = {IceType_i32, IceType_i32}; 7615 ReturnType = IceType_void; 7616 break; 7617 case Intrinsics::Memcpy: 7618 ArgTypes = {IceType_i32, IceType_i32, IceType_i32}; 7619 ReturnType = IceType_void; 7620 break; 7621 case Intrinsics::Memmove: 7622 ArgTypes = {IceType_i32, IceType_i32, IceType_i32}; 7623 ReturnType = IceType_void; 7624 break; 7625 case Intrinsics::Memset: 7626 ArgTypes = {IceType_i32, IceType_i32, IceType_i32}; 7627 ReturnType = IceType_void; 7628 break; 7629 case Intrinsics::NaClReadTP: 7630 ReturnType = IceType_i32; 7631 break; 7632 case Intrinsics::Setjmp: 7633 ArgTypes = {IceType_i32}; 7634 ReturnType = IceType_i32; 7635 break; 7636 } 7637 StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType); 7638 } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) { 7639 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call); 7640 } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) { 7641 if (!Ret->hasRetValue()) 7642 return; 7643 Operand *RetValue = Ret->getRetValue(); 7644 Type ReturnType = RetValue->getType(); 7645 if (!isScalarFloatingType(ReturnType)) 7646 return; 7647 StackArgumentsSize = typeWidthInBytes(ReturnType); 7648 } else { 7649 return; 7650 } 7651 StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize); 7652 updateMaxOutArgsSizeBytes(StackArgumentsSize); 7653 } 7654 7655 template <typename TraitsType> 7656 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes( 7657 const CfgVector<Type> &ArgTypes, Type ReturnType) { 7658 uint32_t OutArgumentsSizeBytes = 0; 7659 uint32_t XmmArgCount = 0; 7660 uint32_t GprArgCount = 0; 7661 for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) { 7662 Type Ty = ArgTypes[i]; 7663 // The PNaCl ABI requires the width of arguments to be at least 32 bits. 7664 assert(typeWidthInBytes(Ty) >= 4); 7665 if (isVectorType(Ty) && 7666 Traits::getRegisterForXmmArgNum(Traits::getArgIndex(i, XmmArgCount)) 7667 .hasValue()) { 7668 ++XmmArgCount; 7669 } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM && 7670 Traits::getRegisterForXmmArgNum( 7671 Traits::getArgIndex(i, XmmArgCount)) 7672 .hasValue()) { 7673 ++XmmArgCount; 7674 } else if (isScalarIntegerType(Ty) && 7675 Traits::getRegisterForGprArgNum( 7676 Ty, Traits::getArgIndex(i, GprArgCount)) 7677 .hasValue()) { 7678 // The 64 bit ABI allows some integers to be passed in GPRs. 7679 ++GprArgCount; 7680 } else { 7681 if (isVectorType(Ty)) { 7682 OutArgumentsSizeBytes = 7683 Traits::applyStackAlignment(OutArgumentsSizeBytes); 7684 } 7685 OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty); 7686 } 7687 } 7688 if (Traits::Is64Bit) 7689 return OutArgumentsSizeBytes; 7690 // The 32 bit ABI requires floating point values to be returned on the x87 FP 7691 // stack. Ensure there is enough space for the fstp/movs for floating returns. 
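// For example, even a call with no stack arguments that returns an f64 still
// reserves 8 bytes of out-args space here so the x87 result can be stored with
// an fstp and then loaded back with a movsd.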
7692 if (isScalarFloatingType(ReturnType)) { 7693 OutArgumentsSizeBytes = 7694 std::max(OutArgumentsSizeBytes, 7695 static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType))); 7696 } 7697 return OutArgumentsSizeBytes; 7698 } 7699 7700 template <typename TraitsType> 7701 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes( 7702 const InstCall *Instr) { 7703 // Build a vector of the arguments' types. 7704 const SizeT NumArgs = Instr->getNumArgs(); 7705 CfgVector<Type> ArgTypes; 7706 ArgTypes.reserve(NumArgs); 7707 for (SizeT i = 0; i < NumArgs; ++i) { 7708 Operand *Arg = Instr->getArg(i); 7709 ArgTypes.emplace_back(Arg->getType()); 7710 } 7711 // Compute the return type (if any); 7712 Type ReturnType = IceType_void; 7713 Variable *Dest = Instr->getDest(); 7714 if (Dest != nullptr) 7715 ReturnType = Dest->getType(); 7716 return getShadowStoreSize<Traits>() + getCallStackArgumentsSizeBytes(ArgTypes, ReturnType); 7717 } 7718 7719 template <typename TraitsType> 7720 Variable *TargetX86Base<TraitsType>::makeZeroedRegister(Type Ty, 7721 RegNumT RegNum) { 7722 Variable *Reg = makeReg(Ty, RegNum); 7723 switch (Ty) { 7724 case IceType_i1: 7725 case IceType_i8: 7726 case IceType_i16: 7727 case IceType_i32: 7728 case IceType_i64: 7729 // Conservatively do "mov reg, 0" to avoid modifying FLAGS. 7730 _mov(Reg, Ctx->getConstantZero(Ty)); 7731 break; 7732 case IceType_f32: 7733 case IceType_f64: 7734 Context.insert<InstFakeDef>(Reg); 7735 _xorps(Reg, Reg); 7736 break; 7737 default: 7738 // All vector types use the same pxor instruction. 7739 assert(isVectorType(Ty)); 7740 Context.insert<InstFakeDef>(Reg); 7741 _pxor(Reg, Reg); 7742 break; 7743 } 7744 return Reg; 7745 } 7746 7747 // There is no support for loading or emitting vector constants, so the vector 7748 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are 7749 // initialized with register operations. 7750 // 7751 // TODO(wala): Add limited support for vector constants so that complex 7752 // initialization in registers is unnecessary. 7753 7754 template <typename TraitsType> 7755 Variable *TargetX86Base<TraitsType>::makeVectorOfZeros(Type Ty, 7756 RegNumT RegNum) { 7757 return makeZeroedRegister(Ty, RegNum); 7758 } 7759 7760 template <typename TraitsType> 7761 Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty, 7762 RegNumT RegNum) { 7763 Variable *MinusOnes = makeReg(Ty, RegNum); 7764 // Insert a FakeDef so the live range of MinusOnes is not overestimated. 7765 Context.insert<InstFakeDef>(MinusOnes); 7766 if (Ty == IceType_f64) 7767 // Making a vector of minus ones of type f64 is currently only used for the 7768 // fabs intrinsic. To use the f64 type to create this mask with pcmpeqq 7769 // requires SSE 4.1. Since we're just creating a mask, pcmpeqd does the 7770 // same job and only requires SSE2. 
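// (The IceType_f32 element-type override below is what selects the 32-bit
// pcmpeqd form of the compare rather than the 64-bit pcmpeqq form.)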
7771 _pcmpeq(MinusOnes, MinusOnes, IceType_f32); 7772 else 7773 _pcmpeq(MinusOnes, MinusOnes); 7774 return MinusOnes; 7775 } 7776 7777 template <typename TraitsType> 7778 Variable *TargetX86Base<TraitsType>::makeVectorOfOnes(Type Ty, RegNumT RegNum) { 7779 Variable *Dest = makeVectorOfZeros(Ty, RegNum); 7780 Variable *MinusOne = makeVectorOfMinusOnes(Ty); 7781 _psub(Dest, MinusOne); 7782 return Dest; 7783 } 7784 7785 template <typename TraitsType> 7786 Variable *TargetX86Base<TraitsType>::makeVectorOfHighOrderBits(Type Ty, 7787 RegNumT RegNum) { 7788 assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 || 7789 Ty == IceType_v16i8); 7790 if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) { 7791 Variable *Reg = makeVectorOfOnes(Ty, RegNum); 7792 SizeT Shift = 7793 typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1; 7794 _psll(Reg, Ctx->getConstantInt8(Shift)); 7795 return Reg; 7796 } else { 7797 // SSE has no left shift operation for vectors of 8 bit integers. 7798 constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080; 7799 Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK); 7800 Variable *Reg = makeReg(Ty, RegNum); 7801 _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem)); 7802 _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8)); 7803 return Reg; 7804 } 7805 } 7806 7807 /// Construct a mask in a register that can be and'ed with a floating-point 7808 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32 7809 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of 7810 /// ones logically right shifted one bit. 7811 // TODO(stichnot): Fix the wala 7812 // TODO: above, to represent vector constants in memory. 7813 template <typename TraitsType> 7814 Variable *TargetX86Base<TraitsType>::makeVectorOfFabsMask(Type Ty, 7815 RegNumT RegNum) { 7816 Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum); 7817 _psrl(Reg, Ctx->getConstantInt8(1)); 7818 return Reg; 7819 } 7820 7821 template <typename TraitsType> 7822 typename TargetX86Base<TraitsType>::X86OperandMem * 7823 TargetX86Base<TraitsType>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot, 7824 uint32_t Offset) { 7825 // Ensure that Loc is a stack slot. 7826 assert(Slot->mustNotHaveReg()); 7827 assert(Slot->getRegNum().hasNoValue()); 7828 // Compute the location of Loc in memory. 7829 // TODO(wala,stichnot): lea should not 7830 // be required. The address of the stack slot is known at compile time 7831 // (although not until after addProlog()). 7832 const Type PointerType = getPointerType(); 7833 Variable *Loc = makeReg(PointerType); 7834 _lea(Loc, Slot); 7835 Constant *ConstantOffset = Ctx->getConstantInt32(Offset); 7836 return X86OperandMem::create(Func, Ty, Loc, ConstantOffset); 7837 } 7838 7839 /// Lowering helper to copy a scalar integer source operand into some 8-bit GPR. 7840 /// Src is assumed to already be legalized. If the source operand is known to 7841 /// be a memory or immediate operand, a simple mov will suffice. But if the 7842 /// source operand can be a physical register, then it must first be copied into 7843 /// a physical register that is truncable to 8-bit, then truncated into a 7844 /// physical register that can receive a truncation, and finally copied into the 7845 /// result 8-bit register (which in general can be any 8-bit register). 
For 7846 /// example, moving %ebp into %ah may be accomplished as: 7847 /// movl %ebp, %edx 7848 /// mov_trunc %edx, %dl // this redundant assignment is ultimately elided 7849 /// movb %dl, %ah 7850 /// On the other hand, moving a memory or immediate operand into ah: 7851 /// movb 4(%ebp), %ah 7852 /// movb $my_imm, %ah 7853 /// 7854 /// Note #1. On a 64-bit target, the "movb 4(%ebp), %ah" is likely not 7855 /// encodable, so RegNum=Reg_ah should NOT be given as an argument. Instead, 7856 /// use RegNum=RegNumT() and then let the caller do a separate copy into 7857 /// Reg_ah. 7858 /// 7859 /// Note #2. ConstantRelocatable operands are also put through this process 7860 /// (not truncated directly) because our ELF emitter does R_386_32 relocations 7861 /// but not R_386_8 relocations. 7862 /// 7863 /// Note #3. If Src is a Variable, the result will be an infinite-weight i8 7864 /// Variable with the RCX86_IsTrunc8Rcvr register class. As such, this helper 7865 /// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument 7866 /// to the pinsrb instruction. 7867 template <typename TraitsType> 7868 Variable *TargetX86Base<TraitsType>::copyToReg8(Operand *Src, RegNumT RegNum) { 7869 Type Ty = Src->getType(); 7870 assert(isScalarIntegerType(Ty)); 7871 assert(Ty != IceType_i1); 7872 Variable *Reg = makeReg(IceType_i8, RegNum); 7873 Reg->setRegClass(RCX86_IsTrunc8Rcvr); 7874 if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) { 7875 Variable *SrcTruncable = makeReg(Ty); 7876 switch (Ty) { 7877 case IceType_i64: 7878 SrcTruncable->setRegClass(RCX86_Is64To8); 7879 break; 7880 case IceType_i32: 7881 SrcTruncable->setRegClass(RCX86_Is32To8); 7882 break; 7883 case IceType_i16: 7884 SrcTruncable->setRegClass(RCX86_Is16To8); 7885 break; 7886 default: 7887 // i8 - just use default register class 7888 break; 7889 } 7890 Variable *SrcRcvr = makeReg(IceType_i8); 7891 SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr); 7892 _mov(SrcTruncable, Src); 7893 _mov(SrcRcvr, SrcTruncable); 7894 Src = SrcRcvr; 7895 } 7896 _mov(Reg, Src); 7897 return Reg; 7898 } 7899 7900 /// Helper for legalize() to emit the right code to lower an operand to a 7901 /// register of the appropriate type. 7902 template <typename TraitsType> 7903 Variable *TargetX86Base<TraitsType>::copyToReg(Operand *Src, RegNumT RegNum) { 7904 Type Ty = Src->getType(); 7905 Variable *Reg = makeReg(Ty, RegNum); 7906 if (isVectorType(Ty)) { 7907 _movp(Reg, Src); 7908 } else { 7909 _mov(Reg, Src); 7910 } 7911 return Reg; 7912 } 7913 7914 template <typename TraitsType> 7915 Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed, 7916 RegNumT RegNum) { 7917 const bool UseNonsfi = getFlags().getUseNonsfi(); 7918 const Type Ty = From->getType(); 7919 // Assert that a physical register is allowed. To date, all calls to 7920 // legalize() allow a physical register. If a physical register needs to be 7921 // explicitly disallowed, then new code will need to be written to force a 7922 // spill. 7923 assert(Allowed & Legal_Reg); 7924 // If we're asking for a specific physical register, make sure we're not 7925 // allowing any other operand kinds. (This could be future work, e.g. allow 7926 // the shl shift amount to be either an immediate or in ecx.) 7927 assert(RegNum.hasNoValue() || Allowed == Legal_Reg); 7928 7929 // Substitute with an available infinite-weight variable if possible. 
Only do 7930 // this when we are not asking for a specific register, and when the 7931 // substitution is not locked to a specific register, and when the types 7932 // match, in order to capture the vast majority of opportunities and avoid 7933 // corner cases in the lowering. 7934 if (RegNum.hasNoValue()) { 7935 if (Variable *Subst = getContext().availabilityGet(From)) { 7936 // At this point we know there is a potential substitution available. 7937 if (Subst->mustHaveReg() && !Subst->hasReg()) { 7938 // At this point we know the substitution will have a register. 7939 if (From->getType() == Subst->getType()) { 7940 // At this point we know the substitution's register is compatible. 7941 return Subst; 7942 } 7943 } 7944 } 7945 } 7946 7947 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) { 7948 // Before doing anything with a Mem operand, we need to ensure that the 7949 // Base and Index components are in physical registers. 7950 Variable *Base = Mem->getBase(); 7951 Variable *Index = Mem->getIndex(); 7952 Constant *Offset = Mem->getOffset(); 7953 Variable *RegBase = nullptr; 7954 Variable *RegIndex = nullptr; 7955 uint16_t Shift = Mem->getShift(); 7956 if (Base) { 7957 RegBase = llvm::cast<Variable>( 7958 legalize(Base, Legal_Reg | Legal_Rematerializable)); 7959 } 7960 if (Index) { 7961 // TODO(jpp): perhaps we should only allow Legal_Reg if 7962 // Base->isRematerializable. 7963 RegIndex = llvm::cast<Variable>( 7964 legalize(Index, Legal_Reg | Legal_Rematerializable)); 7965 } 7966 7967 if (Base != RegBase || Index != RegIndex) { 7968 Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift, 7969 Mem->getSegmentRegister()); 7970 } 7971 7972 // For all Memory Operands, we do randomization/pooling here. 7973 From = randomizeOrPoolImmediate(Mem); 7974 7975 if (!(Allowed & Legal_Mem)) { 7976 From = copyToReg(From, RegNum); 7977 } 7978 return From; 7979 } 7980 7981 if (auto *Const = llvm::dyn_cast<Constant>(From)) { 7982 if (llvm::isa<ConstantUndef>(Const)) { 7983 From = legalizeUndef(Const, RegNum); 7984 if (isVectorType(Ty)) 7985 return From; 7986 Const = llvm::cast<Constant>(From); 7987 } 7988 // There should be no constants of vector type (other than undef). 7989 assert(!isVectorType(Ty)); 7990 7991 // If the operand is a 64 bit constant integer we need to legalize it to a 7992 // register in x86-64. 7993 if (Traits::Is64Bit) { 7994 if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) { 7995 if (!Utils::IsInt(32, C64->getValue())) { 7996 if (RegNum.hasValue()) { 7997 assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum); 7998 } 7999 return copyToReg(Const, RegNum); 8000 } 8001 } 8002 } 8003 8004 // If the operand is an 32 bit constant integer, we should check whether we 8005 // need to randomize it or pool it. 8006 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Const)) { 8007 Operand *NewConst = randomizeOrPoolImmediate(C, RegNum); 8008 if (NewConst != Const) { 8009 return NewConst; 8010 } 8011 } 8012 8013 if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Const)) { 8014 // If the operand is a ConstantRelocatable, and Legal_AddrAbs is not 8015 // specified, and UseNonsfi is indicated, we need to add RebasePtr. 8016 if (UseNonsfi && !(Allowed & Legal_AddrAbs)) { 8017 assert(Ty == IceType_i32); 8018 Variable *NewVar = makeReg(Ty, RegNum); 8019 auto *Mem = Traits::X86OperandMem::create(Func, Ty, nullptr, CR); 8020 // LEAs are not automatically sandboxed, thus we explicitly invoke 8021 // _sandbox_mem_reference. 
8022 _lea(NewVar, _sandbox_mem_reference(Mem)); 8023 From = NewVar; 8024 } 8025 } else if (isScalarFloatingType(Ty)) { 8026 // Convert a scalar floating point constant into an explicit memory 8027 // operand. 8028 if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) { 8029 if (Utils::isPositiveZero(ConstFloat->getValue())) 8030 return makeZeroedRegister(Ty, RegNum); 8031 } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) { 8032 if (Utils::isPositiveZero(ConstDouble->getValue())) 8033 return makeZeroedRegister(Ty, RegNum); 8034 } 8035 8036 auto *CFrom = llvm::cast<Constant>(From); 8037 assert(CFrom->getShouldBePooled()); 8038 Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName()); 8039 auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset); 8040 From = Mem; 8041 } 8042 8043 bool NeedsReg = false; 8044 if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty)) 8045 // Immediate specifically not allowed. 8046 NeedsReg = true; 8047 if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty)) 8048 // On x86, FP constants are lowered to mem operands. 8049 NeedsReg = true; 8050 if (NeedsReg) { 8051 From = copyToReg(From, RegNum); 8052 } 8053 return From; 8054 } 8055 8056 if (auto *Var = llvm::dyn_cast<Variable>(From)) { 8057 // Check if the variable is guaranteed a physical register. This can happen 8058 // either when the variable is pre-colored or when it is assigned infinite 8059 // weight. 8060 bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg()); 8061 bool MustRematerialize = 8062 (Var->isRematerializable() && !(Allowed & Legal_Rematerializable)); 8063 // We need a new physical register for the operand if: 8064 // - Mem is not allowed and Var isn't guaranteed a physical register, or 8065 // - RegNum is required and Var->getRegNum() doesn't match, or 8066 // - Var is a rematerializable variable and rematerializable pass-through is 8067 // not allowed (in which case we need a lea instruction). 8068 if (MustRematerialize) { 8069 Variable *NewVar = makeReg(Ty, RegNum); 8070 // Since Var is rematerializable, the offset will be added when the lea is 8071 // emitted. 8072 constexpr Constant *NoOffset = nullptr; 8073 auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset); 8074 _lea(NewVar, Mem); 8075 From = NewVar; 8076 } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) || 8077 (RegNum.hasValue() && RegNum != Var->getRegNum())) { 8078 From = copyToReg(From, RegNum); 8079 } 8080 return From; 8081 } 8082 8083 llvm::report_fatal_error("Unhandled operand kind in legalize()"); 8084 return From; 8085 } 8086 8087 /// Provide a trivial wrapper to legalize() for this common usage. 8088 template <typename TraitsType> 8089 Variable *TargetX86Base<TraitsType>::legalizeToReg(Operand *From, 8090 RegNumT RegNum) { 8091 return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum)); 8092 } 8093 8094 /// Legalize undef values to concrete values. 8095 template <typename TraitsType> 8096 Operand *TargetX86Base<TraitsType>::legalizeUndef(Operand *From, 8097 RegNumT RegNum) { 8098 Type Ty = From->getType(); 8099 if (llvm::isa<ConstantUndef>(From)) { 8100 // Lower undefs to zero. Another option is to lower undefs to an 8101 // uninitialized register; however, using an uninitialized register results 8102 // in less predictable code. 
    //
    // If in the future the implementation is changed to lower undef values to
    // uninitialized registers, a FakeDef will be needed:
    //   Context.insert<InstFakeDef>(Reg);
    // This is in order to ensure that the live range of Reg is not
    // overestimated. If the constant being lowered is a 64-bit value, then
    // the result should be split, and the lo and hi components will need to
    // go in uninitialized registers.
    if (isVectorType(Ty))
      return makeVectorOfZeros(Ty, RegNum);
    return Ctx->getConstantZero(Ty);
  }
  return From;
}

/// For the cmp instruction, if Src1 is an immediate, or known to be a physical
/// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
/// copied into a physical register. (Actually, either Src0 or Src1 can be
/// chosen for the physical register, but unfortunately we have to commit to
/// one or the other before register allocation.)
template <typename TraitsType>
Operand *TargetX86Base<TraitsType>::legalizeSrc0ForCmp(Operand *Src0,
                                                       Operand *Src1) {
  bool IsSrc1ImmOrReg = false;
  if (llvm::isa<Constant>(Src1)) {
    IsSrc1ImmOrReg = true;
  } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
    if (Var->hasReg())
      IsSrc1ImmOrReg = true;
  }
  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::formMemoryOperand(Operand *Opnd, Type Ty,
                                             bool DoLegalize) {
  auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
  // It may be the case that address mode optimization already creates an
  // X86OperandMem, so in that case it wouldn't need another level of
  // transformation.
  if (!Mem) {
    auto *Base = llvm::dyn_cast<Variable>(Opnd);
    auto *Offset = llvm::dyn_cast<Constant>(Opnd);
    assert(Base || Offset);
    if (Offset) {
      // During memory operand building, we do not blind or pool the constant
      // offset; we will work on the whole memory operand as one entity later,
      // which saves one instruction. By turning blinding and pooling off, we
      // guarantee legalize(Offset) will return a Constant*.
      if (!llvm::isa<ConstantRelocatable>(Offset)) {
        BoolFlagSaver B(RandomizationPoolingPaused, true);

        Offset = llvm::cast<Constant>(legalize(Offset));
      }

      assert(llvm::isa<ConstantInteger32>(Offset) ||
             llvm::isa<ConstantRelocatable>(Offset));
    }
    // Not completely sure whether it's OK to leave IsRebased unset when
    // creating the mem operand. If DoLegalize is true, it will definitely be
    // applied during the legalize() call, but perhaps not during the
    // randomizeOrPoolImmediate() call. In any case, the emit routines will
    // assert that PIC legalization has been applied.
    Mem = X86OperandMem::create(Func, Ty, Base, Offset);
  }
  // Do legalization, which includes randomization/pooling, or else do only
  // the randomization/pooling.
  return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem)
                                              : randomizeOrPoolImmediate(Mem));
}

template <typename TraitsType>
Variable *TargetX86Base<TraitsType>::makeReg(Type Type, RegNumT RegNum) {
  // There aren't any 64-bit integer registers for x86-32.
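  // (A sketch of the intent: on a 32-bit target an i64 value is handled as a
  // lo/hi register pair elsewhere in the lowering, so asking makeReg() for a
  // single i64 temporary would indicate a lowering bug, hence the assert
  // below.)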
  assert(Traits::Is64Bit || Type != IceType_i64);
  Variable *Reg = Func->makeVariable(Type);
  if (RegNum.hasValue())
    Reg->setRegNum(RegNum);
  else
    Reg->setMustHaveReg();
  return Reg;
}

// TypeForSize[i] is a type that is exactly 2^i bytes wide.
const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
                            IceType_v16i8};

/// Returns the widest type in TypeForSize whose width does not exceed Size
/// (and, when MaxSize is not NoSizeLimit, does not exceed MaxSize either).
template <typename TraitsType>
Type TargetX86Base<TraitsType>::largestTypeInSize(uint32_t Size,
                                                  uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}

/// Returns the narrowest type in TypeForSize whose width is at least Size,
/// clamped down to MaxSize when a limit is given.
template <typename TraitsType>
Type TargetX86Base<TraitsType>::firstTypeThatFitsSize(uint32_t Size,
                                                      uint32_t MaxSize) {
  assert(Size != 0);
  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
  if (!llvm::isPowerOf2_32(Size))
    ++TyIndex;
  uint32_t MaxIndex = MaxSize == NoSizeLimit
                          ? llvm::array_lengthof(TypeForSize) - 1
                          : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
  return TypeForSize[std::min(TyIndex, MaxIndex)];
}

template <typename TraitsType> void TargetX86Base<TraitsType>::postLower() {
  if (Func->getOptLevel() == Opt_m1)
    return;
  markRedefinitions();
  Context.availabilityUpdate();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::makeRandomRegisterPermutation(
    llvm::SmallVectorImpl<RegNumT> &Permutation,
    const SmallBitVector &ExcludeRegisters, uint64_t Salt) const {
  Traits::makeRandomRegisterPermutation(Func, Permutation, ExcludeRegisters,
                                        Salt);
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantInteger32 *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << "$" << C->getValue();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantInteger64 *C) const {
  if (!Traits::Is64Bit) {
    llvm::report_fatal_error("Not expecting to emit 64-bit integers");
  } else {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    Str << "$" << C->getValue();
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantFloat *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << C->getLabelName();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantDouble *C) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << C->getLabelName();
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emit(const ConstantUndef *) const {
  llvm::report_fatal_error("undef value encountered by emitter.");
}

template <class Machine>
void TargetX86Base<Machine>::emit(const ConstantRelocatable *C) const {
  if (!BuildDefs::dump())
    return;
  assert(!getFlags().getUseNonsfi() ||
         C->getName().toString() == GlobalOffsetTable);
  Ostream &Str = Ctx->getStrEmit();
  Str << "$";
  emitWithoutPrefix(C);
}

/// Randomize or pool an Immediate.
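///
/// A worked sketch of the two modes (register names and label text are
/// illustrative only): under RPI_Randomize, with blinding cookie C, an
/// eligible 32-bit immediate V is rewritten roughly as
///   mov Reg, (V + C)
///   lea Reg, [Reg - C]
/// and Reg replaces the immediate; under RPI_Pool, the immediate is instead
/// loaded from its constant-pool entry, e.g.
///   mov Reg, [<label from getLabelName()>]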
template <typename TraitsType>
Operand *
TargetX86Base<TraitsType>::randomizeOrPoolImmediate(Constant *Immediate,
                                                    RegNumT RegNum) {
  assert(llvm::isa<ConstantInteger32>(Immediate) ||
         llvm::isa<ConstantRelocatable>(Immediate));
  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
      RandomizationPoolingPaused) {
    // Immediate randomization/pooling is off or paused.
    return Immediate;
  }

  if (Traits::Is64Bit && NeedSandboxing) {
    // Immediate randomization/pooling is currently disabled for x86-64
    // sandboxing because it could generate invalid memory operands.
    assert(false &&
           "Constant pooling/randomization is disabled for x8664 sandbox.");
    return Immediate;
  }

  if (!Immediate->shouldBeRandomizedOrPooled()) {
    // The constant Immediate is not eligible for blinding/pooling.
    return Immediate;
  }
  Ctx->statsUpdateRPImms();
  switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
  default:
    llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
  case RPI_Randomize: {
    // Blind the constant.
    // FROM:
    //   imm
    // TO:
    //   insert: mov imm+cookie, Reg
    //   insert: lea -cookie[Reg], Reg
    //   => Reg
    // If we have already assigned a physical register, we must come from
    // advancedPhiLowering()=>lowerAssign(). In this case we should reuse the
    // assigned register, as this assignment is the start of its use-def
    // chain. So we add the RegNum argument here. Note we use the 'lea'
    // instruction instead of 'xor' to avoid affecting the flags.
    Variable *Reg = makeReg(IceType_i32, RegNum);
    auto *Integer = llvm::cast<ConstantInteger32>(Immediate);
    uint32_t Value = Integer->getValue();
    uint32_t Cookie = Func->getConstantBlindingCookie();
    _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value));
    Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie);
    _lea(Reg, X86OperandMem::create(Func, IceType_i32, Reg, Offset));
    if (Immediate->getType() == IceType_i32) {
      return Reg;
    }
    Variable *TruncReg = makeReg(Immediate->getType(), RegNum);
    _mov(TruncReg, Reg);
    return TruncReg;
  }
  case RPI_Pool: {
    // Pool the constant.
    // FROM:
    //   imm
    // TO:
    //   insert: mov $label, Reg
    //   => Reg
    assert(getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool);
    assert(Immediate->getShouldBePooled());
    // If we have already assigned a physical register, we must come from
    // advancedPhiLowering()=>lowerAssign(). In this case we should reuse the
    // assigned register, as this assignment is the start of its use-def
    // chain. So we add the RegNum argument here.
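    // For example (a sketch; the real label text comes from
    // Immediate->getLabelName() and the register from the allocator): a
    // pooled i32 immediate stored under a label such as .L$i32$7 would be
    // reloaded with something like (AT&T syntax)
    //   movl .L$i32$7, %eax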
    Variable *Reg = makeReg(Immediate->getType(), RegNum);
    constexpr RelocOffsetT Offset = 0;
    Constant *Symbol = Ctx->getConstantSym(Offset, Immediate->getLabelName());
    constexpr Variable *NoBase = nullptr;
    X86OperandMem *MemOperand =
        X86OperandMem::create(Func, Immediate->getType(), NoBase, Symbol);
    _mov(Reg, MemOperand);
    return Reg;
  }
  }
}

template <typename TraitsType>
typename TargetX86Base<TraitsType>::X86OperandMem *
TargetX86Base<TraitsType>::randomizeOrPoolImmediate(X86OperandMem *MemOperand,
                                                    RegNumT RegNum) {
  assert(MemOperand);
  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
      RandomizationPoolingPaused) {
    // Immediate randomization/pooling is turned off or paused.
    return MemOperand;
  }

  if (Traits::Is64Bit && NeedSandboxing) {
    // Immediate randomization/pooling is currently disabled for x86-64
    // sandboxing because it could generate invalid memory operands.
    assert(false &&
           "Constant pooling/randomization is disabled for x8664 sandbox.");
    return MemOperand;
  }

  // If this memory operand is already a randomized one, we do not randomize
  // it again.
  if (MemOperand->getRandomized())
    return MemOperand;

  auto *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset());

  if (C == nullptr) {
    return MemOperand;
  }

  if (!C->shouldBeRandomizedOrPooled()) {
    return MemOperand;
  }

  // The offset of this mem operand should be blinded or pooled.
  Ctx->statsUpdateRPImms();
  switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
  default:
    llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
  case RPI_Randomize: {
    // Blind the constant offset.
    // FROM:
    //   offset[base, index, shift]
    // TO:
    //   insert: lea offset+cookie[base], RegTemp
    //   => -cookie[RegTemp, index, shift]
    uint32_t Value =
        llvm::dyn_cast<ConstantInteger32>(MemOperand->getOffset())->getValue();
    uint32_t Cookie = Func->getConstantBlindingCookie();
    Constant *Mask1 =
        Ctx->getConstantInt(MemOperand->getOffset()->getType(), Cookie + Value);
    Constant *Mask2 =
        Ctx->getConstantInt(MemOperand->getOffset()->getType(), 0 - Cookie);

    X86OperandMem *TempMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), MemOperand->getBase(), Mask1);
    // If we have already assigned a physical register, we must come from
    // advancedPhiLowering()=>lowerAssign(). In this case we should reuse
    // the assigned register, as this assignment is the start of its
    // use-def chain. So we add the RegNum argument here.
    Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum);
    _lea(RegTemp, TempMemOperand);

    X86OperandMem *NewMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), RegTemp, Mask2, MemOperand->getIndex(),
        MemOperand->getShift(), MemOperand->getSegmentRegister(),
        MemOperand->getIsRebased());

    // Label this memory operand as randomized, so we won't randomize it
    // again in case we call legalize() multiple times on this memory
    // operand.
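    //
    // Concretely (an illustrative sketch with made-up registers): blinding
    // 12(%ebx,%esi,1) with cookie C first emits
    //   lea (12 + C)(%ebx), RegTemp
    // and then rewrites the operand as (-C)(RegTemp,%esi,1), which computes
    // the same effective address without exposing the raw offset.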
    NewMemOperand->setRandomized(true);
    return NewMemOperand;
  }
  case RPI_Pool: {
    // Pool the constant offset.
    // FROM:
    //   offset[base, index, shift]
    // TO:
    //   insert: mov $label, RegTemp
    //   insert: lea [base, RegTemp], RegTemp
    //   => [RegTemp, index, shift]

    // Memory operands should never appear as source operands in phi lowering
    // assignments, so there is no need to reuse any registers here. For phi
    // lowering, we should not ask for new physical registers in general.
    // However, if we do encounter a memory operand during phi lowering, we
    // should not blind or pool the immediates for now.
    if (RegNum.hasValue())
      return MemOperand;
    Variable *RegTemp = makeReg(IceType_i32);
    assert(MemOperand->getOffset()->getShouldBePooled());
    constexpr RelocOffsetT SymOffset = 0;
    Constant *Symbol =
        Ctx->getConstantSym(SymOffset, MemOperand->getOffset()->getLabelName());
    constexpr Variable *NoBase = nullptr;
    X86OperandMem *SymbolOperand = X86OperandMem::create(
        Func, MemOperand->getOffset()->getType(), NoBase, Symbol);
    _mov(RegTemp, SymbolOperand);
    // If we have a base variable here, we should add the lea instruction to
    // add the value of the base variable to RegTemp. If there is no base
    // variable, we won't need this lea instruction.
    if (MemOperand->getBase()) {
      X86OperandMem *CalculateOperand = X86OperandMem::create(
          Func, MemOperand->getType(), MemOperand->getBase(), nullptr, RegTemp,
          0, MemOperand->getSegmentRegister());
      _lea(RegTemp, CalculateOperand);
    }
    X86OperandMem *NewMemOperand = X86OperandMem::create(
        Func, MemOperand->getType(), RegTemp, nullptr, MemOperand->getIndex(),
        MemOperand->getShift(), MemOperand->getSegmentRegister());
    return NewMemOperand;
  }
  }
}

template <typename TraitsType>
void TargetX86Base<TraitsType>::emitJumpTable(
    const Cfg *, const InstJumpTable *JumpTable) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  const bool UseNonsfi = getFlags().getUseNonsfi();
  const char *Prefix = UseNonsfi ? ".data.rel.ro." : ".rodata.";
  Str << "\t.section\t" << Prefix << JumpTable->getSectionName()
      << ",\"a\",@progbits\n"
         "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n"
      << JumpTable->getName() << ":";

  // On x86, ILP32 pointers are 32 bits, hence the use of .long.
  for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
    Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
  Str << "\n";
}

template <typename TraitsType>
template <typename T>
void TargetDataX86<TraitsType>::emitConstantPool(GlobalContext *Ctx) {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Type Ty = T::Ty;
  SizeT Align = typeAlignInBytes(Ty);
  ConstantList Pool = Ctx->getConstantPool(Ty);

  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
      << "\n";
  Str << "\t.align\t" << Align << "\n";

  // If the reorder-pooled-constants option is set, we need to shuffle the
  // constant pool before emitting it.
  if (getFlags().getReorderPooledConstants() && !Pool.empty()) {
    // Use the constant's kind value as the salt for creating the random
    // number generator.
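    // (A note on the intent, as understood from this code: salting with the
    // operand kind keeps the shuffle deterministic for a fixed seed while
    // still producing a different permutation for, say, the f32 pool than
    // for the f64 pool.)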
    Operand::OperandKind K = (*Pool.begin())->getKind();
    RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                              RPE_PooledConstantReordering, K);
    RandomShuffle(Pool.begin(), Pool.end(),
                  [&RNG](uint64_t N) { return (uint32_t)RNG.next(N); });
  }

  for (Constant *C : Pool) {
    if (!C->getShouldBePooled())
      continue;
    auto *Const = llvm::cast<typename T::IceType>(C);
    typename T::IceType::PrimType Value = Const->getValue();
    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
    // breaking strict-aliasing rules.
    typename T::PrimitiveIntType RawValue;
    memcpy(&RawValue, &Value, sizeof(Value));
    char buf[30];
    int CharsPrinted =
        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
    assert(CharsPrinted >= 0);
    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
    (void)CharsPrinted; // avoid warnings if asserts are disabled
    Str << Const->getLabelName();
    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
        << Value << " */\n";
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerConstants() {
  if (getFlags().getDisableTranslation())
    return;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();

    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);

    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);

    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);

    emitConstantPool<PoolTypeConverter<float>>(Ctx);
    emitConstantPool<PoolTypeConverter<double>>(Ctx);
  } break;
  }
}

template <typename TraitsType>
void TargetDataX86<TraitsType>::lowerJumpTables() {
  const bool IsPIC = getFlags().getUseNonsfi();
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
    const FixupKind RelocationKind =
        (getPointerType() == IceType_i32) ? Traits::FK_Abs : FK_Abs64;
    for (const JumpTableData &JT : Ctx->getJumpTables())
      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
  } break;
  case FT_Asm:
    // Already emitted from the Cfg.
    break;
  case FT_Iasm: {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    const char *Prefix = IsPIC ? ".data.rel.ro."
: ".rodata."; 8593 for (const JumpTableData &JT : Ctx->getJumpTables()) { 8594 Str << "\t.section\t" << Prefix << JT.getSectionName() 8595 << ",\"a\",@progbits\n" 8596 "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n" 8597 << JT.getName().toString() << ":"; 8598 8599 // On X8664 ILP32 pointers are 32-bit hence the use of .long 8600 for (intptr_t TargetOffset : JT.getTargetOffsets()) 8601 Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset; 8602 Str << "\n"; 8603 } 8604 } break; 8605 } 8606 } 8607 8608 template <typename TraitsType> 8609 void TargetDataX86<TraitsType>::lowerGlobals( 8610 const VariableDeclarationList &Vars, const std::string &SectionSuffix) { 8611 const bool IsPIC = getFlags().getUseNonsfi(); 8612 switch (getFlags().getOutFileType()) { 8613 case FT_Elf: { 8614 ELFObjectWriter *Writer = Ctx->getObjectWriter(); 8615 Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC); 8616 } break; 8617 case FT_Asm: 8618 case FT_Iasm: { 8619 OstreamLocker L(Ctx); 8620 for (const VariableDeclaration *Var : Vars) { 8621 if (getFlags().matchTranslateOnly(Var->getName(), 0)) { 8622 emitGlobal(*Var, SectionSuffix); 8623 } 8624 } 8625 } break; 8626 } 8627 } 8628 } // end of namespace X86NAMESPACE 8629 } // end of namespace Ice 8630 8631 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H 8632