//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
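///
/// A minimal sketch of a derived class (illustrative only; the Foo* names
/// below are hypothetical, not a real target):
/// \code
///   class FooTTIImpl : public BasicTTIImplBase<FooTTIImpl> {
///     using BaseT = BasicTTIImplBase<FooTTIImpl>;
///
///     const FooSubtarget *ST;
///     const FooTargetLowering *TLI;
///
///   public:
///     explicit FooTTIImpl(const FooTargetMachine *TM, const Function &F)
///         : BaseT(TM, F.getParent()->getDataLayout()),
///           ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
///
///     const FooSubtarget *getST() const { return ST; }
///     const FooTargetLowering *getTLI() const { return TLI; }
///   };
/// \endcode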
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  unsigned getBroadcastShuffleOverhead(Type *Ty) {
    assert(Ty->isVectorTy() && "Can only shuffle vectors");
    unsigned Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zeroth element
    // plus the cost of inserting it into every element of the result vector.
    Cost += static_cast<T *>(this)->getVectorInstrCost(
        Instruction::ExtractElement, Ty, 0);

    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::InsertElement, Ty, i);
    }
    return Cost;
  }

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  unsigned getPermuteShuffleOverhead(Type *Ty) {
    assert(Ty->isVectorTy() && "Can only shuffle vectors");
    unsigned Cost = 0;
    // Shuffle cost is equal to the cost of extracting elements from its
    // arguments plus the cost of inserting them into the result vector.

    // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector and finally index 3 of the second vector, and insert
    // them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  unsigned getExtractSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
    assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubTy->getVectorNumElements();
    assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
           "SK_ExtractSubvector index out of range");

    unsigned Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting elements
    // from the source type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::ExtractElement, Ty, i + Index);
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::InsertElement, SubTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  unsigned getInsertSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
    assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubTy->getVectorNumElements();
    assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
           "SK_InsertSubvector index out of range");

    unsigned Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting elements
    // from the subvector type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::ExtractElement, SubTy, i);
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::InsertElement, Ty, i + Index);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
    case TTI::MIM_Unindexed:
      return ISD::UNINDEXED;
    case TTI::MIM_PreInc:
      return ISD::PRE_INC;
    case TTI::MIM_PreDec:
      return ISD::PRE_DEC;
    case TTI::MIM_PostInc:
      return ISD::POST_INC;
    case TTI::MIM_PostDec:
      return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, unsigned Alignment,
                                      bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                        Value *OldV, Value *NewV) const {
    return false;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

  int getExtCost(const Instruction *I, const Value *Src) {
    if (getTLI()->isExtFree(I))
      return TargetTransformInfo::TCC_Free;

    if (isa<ZExtInst>(I) || isa<SExtInst>(I))
      if (const LoadInst *LI = dyn_cast<LoadInst>(Src))
        if (getTLI()->isExtLoad(LI, I, DL))
          return TargetTransformInfo::TCC_Free;

    return TargetTransformInfo::TCC_Basic;
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<const Value *> Arguments, const User *U) {
    return BaseT::getIntrinsicCost(IID, RetTy, Arguments, U);
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<Type *> ParamTys, const User *U) {
    if (IID == Intrinsic::cttz) {
      if (getTLI()->isCheapToSpeculateCttz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    if (IID == Intrinsic::ctlz) {
      if (getTLI()->isCheapToSpeculateCtlz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys, U);
  }
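
  // Illustrative example (exact behavior is target-dependent): for a switch
  // with four contiguous case values such as
  //   switch i32 %x, label %default [ i32 0, label %a
  //                                   i32 1, label %b
  //                                   i32 2, label %c
  //                                   i32 3, label %d ]
  // the case values are usually dense enough to form a single jump-table (or
  // bit-test) cluster, so getEstimatedNumberOfCaseClusters() below returns 1
  // and sets JumpTableSize to the case-value range. Four widely scattered
  // case values that suit neither lowering are instead counted as four
  // separate clusters (return value 4).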
  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of switch in
    /// inline cost heuristic, but it's a generic cost model to be used in
    /// other places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if neither a jump table nor a bit test is possible.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table.
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  unsigned getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
    const TargetLoweringBase *TLI = getTLI();
    switch (Opcode) {
    default: break;
    case Instruction::Trunc:
      if (TLI->isTruncateFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    case Instruction::ZExt:
      if (TLI->isZExtFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;

    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(OpTy->getPointerAddressSpace(),
                                   Ty->getPointerAddressSpace()))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    }

    return BaseT::getOperationCost(Opcode, Ty, OpTy);
  }

  unsigned getInliningThresholdMultiplier() { return 1; }

  int getInlinerVectorBonusPercent() { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
         ++I) {
      BasicBlock *BB = *I;

      for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
        if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
          ImmutableCallSite CS(&*J);
          if (const Function *F = CS.getCalledFunction()) {
            if (!static_cast<T *>(this)->isLoweredToCall(F))
              continue;
          }

          return;
        }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) {
    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
  }

  int getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;

    return BaseT::getInstructionLatency(I);
  }

  virtual Optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return Optional<unsigned>(
        getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual Optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    Optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride() const {
    return getST()->getMinPrefetchStride();
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getRegisterBitWidth(bool Vector) const { return 32; }
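
  // Illustrative note (actual numbers are target-dependent): the
  // scalarization helpers below sum per-element insert/extract costs. For
  // example, if
  //   %r = fadd <4 x float> %a, %b
  // cannot be kept as a vector operation, its cost is modeled roughly as
  //   getOperandsScalarizationOverhead({%a, %b}, 4)          ; extracts
  //   + 4 * cost(scalar fadd)
  //   + getScalarizationOverhead(<4 x float>, /*Insert=*/true,
  //                              /*Extract=*/false)          ; rebuild result
  // which is how getArithmeticInstrCost() falls back further down.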
  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
    assert(Ty->isVectorTy() && "Can only scalarize vectors");
    unsigned Cost = 0;

    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      if (Insert)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied with VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (const Value *A : Args) {
      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        Type *VecTy = nullptr;
        if (A->getType()->isVectorTy()) {
          VecTy = A->getType();
          // If A is a vector operand, VF should be 1 or correspond to A.
          assert((VF == 1 || VF == VecTy->getVectorNumElements()) &&
                 "Vector argument does not match VF");
        }
        else
          VecTy = VectorType::get(A->getType(), VF);

        Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
    assert(VecTy->isVectorTy());

    unsigned Cost = 0;

    Cost += getScalarizationOverhead(VecTy, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args,
                                               VecTy->getVectorNumElements());
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(VecTy, false, true);

    return Cost;
  }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (Ty->isVectorTy()) {
      unsigned Num = Ty->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)
                          ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Ty, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                          Type *SubTp) {
    switch (Kind) {
    case TTI::SK_Broadcast:
      return getBroadcastShuffleOverhead(Tp);
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      return getPermuteShuffleOverhead(Tp);
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, Index, SubTp);
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, Index, SubTp);
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                            const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);

    // Check for NOOP conversions.
    if (SrcLT.first == DstLT.first &&
        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

      // Bitcasts between types that are legalized to the same type are free.
      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
        return 0;
    }

    if (Opcode == Instruction::Trunc &&
        TLI->isTruncateFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::ZExt &&
        TLI->isZExtFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::AddrSpaceCast &&
        TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                 Dst->getPointerAddressSpace()))
      return 0;

    // If this is a zext/sext of a load, return 0 if the corresponding
    // extending load exists on target.
    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        I && isa<LoadInst>(I->getOperand(0))) {
      EVT ExtVT = EVT::getEVT(Dst);
      EVT LoadVT = EVT::getEVT(Src);
      unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
      if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
        return 0;
    }

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return 1;

    // Handle scalar conversions.
    if (!Src->isVectorTy() && !Dst->isVectorTy()) {
      // Scalar bitcasts are usually free.
      if (Opcode == Instruction::BitCast)
        return 0;

      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (Dst->isVectorTy() && Src->isVectorTy()) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first &&
          SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return 1;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
               TargetLowering::TypeSplitVector ||
           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
               TargetLowering::TypeSplitVector) &&
          Src->getVectorNumElements() > 1 && Dst->getVectorNumElements() > 1) {
        Type *SplitDst = VectorType::get(Dst->getVectorElementType(),
                                         Dst->getVectorNumElements() / 2);
        Type *SplitSrc = VectorType::get(Src->getVectorElementType(),
                                         Src->getVectorNumElements() / 2);
        T *TTI = static_cast<T *>(this);
        return TTI->getVectorSplitCost() +
               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
      }

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = Dst->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcast between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast)
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
                                : 0) +
             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
                                : 0);

    llvm_unreachable("Unhandled cast");
  }

  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                    VectorType *VecTy, unsigned Index) {
    return static_cast<T *>(this)->getVectorInstrCost(
               Instruction::ExtractElement, VecTy, Index) +
           static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
                                                    VecTy->getElementType());
  }

  unsigned getCFInstrCost(unsigned Opcode) {
    // Branches are assumed to be predicted.
    return 0;
  }
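
  // Illustrative note (actual costs are target-dependent): a vector select
  // such as
  //   %r = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b
  // is costed below as ISD::VSELECT. If the legalized type supports the
  // operation, the cost is just the type-legalization factor; otherwise the
  // select is priced as four scalar selects plus the insertion overhead for
  // rebuilding the <4 x i32> result.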
  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                              const Instruction *I) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the cast is scalarized.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (ValTy->isVectorTy()) {
      unsigned Num = ValTy->getVectorNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
          Opcode, ValTy->getScalarType(), CondTy, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    std::pair<unsigned, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                           unsigned AddressSpace,
                           const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

    // Assuming that all loads of legal types cost 1.
    unsigned Cost = LT.first;

    if (Src->isVectorTy() &&
        Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }

  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment, unsigned AddressSpace,
                                      bool UseMaskForCond = false,
                                      bool UseMaskForGaps = false) {
    VectorType *VT = dyn_cast<VectorType>(VecTy);
    assert(VT && "Expect a vector type for interleaved memory op");

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);

    // Firstly, the cost of the load/store operation.
    unsigned Cost;
    if (UseMaskForCond || UseMaskForGaps)
      Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
          Opcode, VecTy, Alignment, AddressSpace);
    else
      Cost = static_cast<T *>(this)->getMemoryOpCost(
          Opcode, VecTy, MaybeAlign(Alignment), AddressSpace);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
    unsigned VecTySize =
        static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Return the ceiling of dividing A by B.
    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., An interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // We only scale the cost of loads since interleaved store groups aren't
    // allowed to have gaps.
    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used, rounding up so a partially used group never costs zero.
      Cost = ceil(UsedInsts.count() * Cost, NumLegalInsts);
    }

    // Then add the cost of the interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting sub vectors' elements
      // from the wide vector, and inserting them into sub vectors.
      //
      // E.g. An interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
      // <8 x i32> vector and inserting them into a <4 x i32> vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");

      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from loaded vector for each sub vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += static_cast<T *>(this)->getVectorInstrCost(
              Instruction::ExtractElement, VT, Index + i * Factor);
      }

      unsigned InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is extracting all elements from the sub vectors,
      // and inserting them into the wide vector.
      //
      // E.g. An interleaved store of factor 2:
      //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extracting all elements from both <4 x i32>
      // vectors and inserting them into the <8 x i32> vector.

      unsigned ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::ExtractElement, SubVT, i);
      Cost += ExtSubCost * Factor;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    if (!UseMaskForCond)
      return Cost;

    Type *I8Type = Type::getInt8Ty(VT->getContext());
    VectorType *MaskVT = VectorType::get(I8Type, NumElts);
    SubVT = VectorType::get(I8Type, NumSubElts);

    // The Mask shuffling cost is extracting all the elements of the Mask
    // and inserting each of them Factor times into the wide vector:
    //
    // E.g. an interleaved group with factor 3:
    //    %mask = icmp ult <8 x i32> %vec1, %vec2
    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
    // The cost is estimated as extracting all mask elements from the <8xi1>
    // mask vector and inserting them factor times into the <24xi1> shuffled
    // mask vector.
    for (unsigned i = 0; i < NumSubElts; i++)
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::ExtractElement, SubVT, i);

    for (unsigned i = 0; i < NumElts; i++)
      Cost += static_cast<T *>(this)->getVectorInstrCost(
          Instruction::InsertElement, MaskVT, i);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However, if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps)
      Cost += static_cast<T *>(this)->getArithmeticInstrCost(
          BinaryOperator::And, MaskVT);

    return Cost;
  }
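
  // Illustrative note (assumes a target with no vector sin lowering): a call
  // such as
  //   %r = call <4 x float> @llvm.sin.v4f32(<4 x float> %x)
  // goes through the default paths below and ends up priced as four scalar
  // llvm.sin calls (each typically costed as an expensive library call) plus
  // the extract/insert overhead of scalarizing the operand and rebuilding the
  // <4 x float> result.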
  /// Get intrinsic cost based on arguments.
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Value *> Args, FastMathFlags FMF,
                                 unsigned VF = 1) {
    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
    auto *ConcreteTTI = static_cast<T *>(this);

    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      SmallVector<Type *, 4> Types;
      for (Value *Op : Args) {
        Type *OpTy = Op->getType();
        assert(VF == 1 || !OpTy->isVectorTy());
        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
      }

      if (VF > 1 && !RetTy->isVoidTy())
        RetTy = VectorType::get(RetTy, VF);

      // Compute the scalarization overhead based on Args for a vector
      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
      // CostModel will pass a vector RetTy and VF is 1.
      unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
      if (RetVF > 1 || VF > 1) {
        ScalarizationCost = 0;
        if (!RetTy->isVoidTy())
          ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
      }

      return ConcreteTTI->getIntrinsicInstrCost(IID, RetTy, Types, FMF,
                                                ScalarizationCost);
    }
    case Intrinsic::masked_scatter: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
      return ConcreteTTI->getGatherScatterOpCost(
          Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment);
    }
    case Intrinsic::masked_gather: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
      return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy,
                                                 Args[0], VarMask, Alignment);
    }
    case Intrinsic::experimental_vector_reduce_add:
    case Intrinsic::experimental_vector_reduce_mul:
    case Intrinsic::experimental_vector_reduce_and:
    case Intrinsic::experimental_vector_reduce_or:
    case Intrinsic::experimental_vector_reduce_xor:
    case Intrinsic::experimental_vector_reduce_v2_fadd:
    case Intrinsic::experimental_vector_reduce_v2_fmul:
    case Intrinsic::experimental_vector_reduce_smax:
    case Intrinsic::experimental_vector_reduce_smin:
    case Intrinsic::experimental_vector_reduce_fmax:
    case Intrinsic::experimental_vector_reduce_fmin:
    case Intrinsic::experimental_vector_reduce_umax:
    case Intrinsic::experimental_vector_reduce_umin:
      return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF);
    case Intrinsic::fshl:
    case Intrinsic::fshr: {
      Value *X = Args[0];
      Value *Y = Args[1];
      Value *Z = Args[2];
      TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
      TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
      TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
      TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
      TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
      OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
                                                              : TTI::OP_None;
      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
      unsigned Cost = 0;
      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Or, RetTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Sub, RetTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::Shl, RetTy,
                                                  OpKindX, OpKindZ, OpPropsX);
      Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::LShr, RetTy,
                                                  OpKindY, OpKindZ, OpPropsY);
      // Non-constant shift amounts require a modulo.
      if (OpKindZ != TTI::OK_UniformConstantValue &&
          OpKindZ != TTI::OK_NonUniformConstantValue)
        Cost += ConcreteTTI->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
                                                    OpKindZ, OpKindBW, OpPropsZ,
                                                    OpPropsBW);
      // For non-rotates (X != Y) we must add shift-by-zero handling costs.
      if (X != Y) {
        Type *CondTy = RetTy->getWithNewBitWidth(1);
        Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy,
                                                CondTy, nullptr);
        Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
                                                CondTy, nullptr);
      }
      return Cost;
    }
    }
  }

  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
  /// cost of scalarizing the arguments and the return value will be computed
  /// based on types.
  unsigned getIntrinsicInstrCost(
      Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
      unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
    auto *ConcreteTTI = static_cast<T *>(this);

    SmallVector<unsigned, 2> ISDs;
    unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      unsigned ScalarizationCost = ScalarizationCostPassed;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (RetTy->isVectorTy()) {
        if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy()) {
          if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
          ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
          Ty = Ty->getScalarType();
        }
        ScalarTys.push_back(Ty);
      }
      if (ScalarCalls == 1)
        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

      unsigned ScalarCost =
          ConcreteTTI->getIntrinsicInstrCost(IID, ScalarRetTy, ScalarTys, FMF);

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }
    // Look for intrinsics that can be lowered directly or turned into a scalar
    // intrinsic call.
    case Intrinsic::sqrt:
      ISDs.push_back(ISD::FSQRT);
      break;
    case Intrinsic::sin:
      ISDs.push_back(ISD::FSIN);
      break;
    case Intrinsic::cos:
      ISDs.push_back(ISD::FCOS);
      break;
    case Intrinsic::exp:
      ISDs.push_back(ISD::FEXP);
      break;
    case Intrinsic::exp2:
      ISDs.push_back(ISD::FEXP2);
      break;
    case Intrinsic::log:
      ISDs.push_back(ISD::FLOG);
      break;
    case Intrinsic::log10:
      ISDs.push_back(ISD::FLOG10);
      break;
    case Intrinsic::log2:
      ISDs.push_back(ISD::FLOG2);
      break;
    case Intrinsic::fabs:
      ISDs.push_back(ISD::FABS);
      break;
    case Intrinsic::canonicalize:
      ISDs.push_back(ISD::FCANONICALIZE);
      break;
    case Intrinsic::minnum:
      ISDs.push_back(ISD::FMINNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMINIMUM);
      break;
    case Intrinsic::maxnum:
      ISDs.push_back(ISD::FMAXNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMAXIMUM);
      break;
    case Intrinsic::copysign:
      ISDs.push_back(ISD::FCOPYSIGN);
      break;
    case Intrinsic::floor:
      ISDs.push_back(ISD::FFLOOR);
      break;
    case Intrinsic::ceil:
      ISDs.push_back(ISD::FCEIL);
      break;
    case Intrinsic::trunc:
      ISDs.push_back(ISD::FTRUNC);
      break;
    case Intrinsic::nearbyint:
      ISDs.push_back(ISD::FNEARBYINT);
      break;
    case Intrinsic::rint:
      ISDs.push_back(ISD::FRINT);
      break;
    case Intrinsic::round:
      ISDs.push_back(ISD::FROUND);
      break;
    case Intrinsic::pow:
      ISDs.push_back(ISD::FPOW);
      break;
    case Intrinsic::fma:
      ISDs.push_back(ISD::FMA);
      break;
    case Intrinsic::fmuladd:
      ISDs.push_back(ISD::FMA);
      break;
    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
    case Intrinsic::sideeffect:
      return 0;
    case Intrinsic::masked_store:
      return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0,
                                                0);
    case Intrinsic::masked_load:
      return ConcreteTTI->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
    case Intrinsic::experimental_vector_reduce_add:
      return ConcreteTTI->getArithmeticReductionCost(Instruction::Add, Tys[0],
                                                     /*IsPairwiseForm=*/false);
    case Intrinsic::experimental_vector_reduce_mul:
      return ConcreteTTI->getArithmeticReductionCost(Instruction::Mul, Tys[0],
                                                     /*IsPairwiseForm=*/false);
    case Intrinsic::experimental_vector_reduce_and:
      return ConcreteTTI->getArithmeticReductionCost(Instruction::And, Tys[0],
                                                     /*IsPairwiseForm=*/false);
    case Intrinsic::experimental_vector_reduce_or:
      return ConcreteTTI->getArithmeticReductionCost(Instruction::Or, Tys[0],
                                                     /*IsPairwiseForm=*/false);
    case Intrinsic::experimental_vector_reduce_xor:
      return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0],
                                                     /*IsPairwiseForm=*/false);
    case Intrinsic::experimental_vector_reduce_v2_fadd:
      return ConcreteTTI->getArithmeticReductionCost(
          Instruction::FAdd, Tys[0],
          /*IsPairwiseForm=*/false); // FIXME: Add new flag for cost of strict
                                     // reductions.
    case Intrinsic::experimental_vector_reduce_v2_fmul:
      return ConcreteTTI->getArithmeticReductionCost(
          Instruction::FMul, Tys[0],
          /*IsPairwiseForm=*/false); // FIXME: Add new flag for cost of strict
                                     // reductions.
    case Intrinsic::experimental_vector_reduce_smax:
    case Intrinsic::experimental_vector_reduce_smin:
    case Intrinsic::experimental_vector_reduce_fmax:
    case Intrinsic::experimental_vector_reduce_fmin:
      return ConcreteTTI->getMinMaxReductionCost(
          Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
          /*IsUnsigned=*/false);
    case Intrinsic::experimental_vector_reduce_umax:
    case Intrinsic::experimental_vector_reduce_umin:
      return ConcreteTTI->getMinMaxReductionCost(
          Tys[0], CmpInst::makeCmpResultType(Tys[0]), /*IsPairwiseForm=*/false,
          /*IsUnsigned=*/true);
    case Intrinsic::sadd_sat:
    case Intrinsic::ssub_sat: {
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
      Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
                                     ? Intrinsic::sadd_with_overflow
                                     : Intrinsic::ssub_with_overflow;

      // SatMax -> Overflow && SumDiff < 0
      // SatMin -> Overflow && SumDiff >= 0
      unsigned Cost = 0;
      Cost += ConcreteTTI->getIntrinsicInstrCost(
          OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed);
      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy,
                                              CondTy, nullptr);
      Cost += 2 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
                                                  CondTy, nullptr);
      return Cost;
    }
    case Intrinsic::uadd_sat:
    case Intrinsic::usub_sat: {
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
      Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
                                     ? Intrinsic::uadd_with_overflow
                                     : Intrinsic::usub_with_overflow;

      unsigned Cost = 0;
      Cost += ConcreteTTI->getIntrinsicInstrCost(
          OverflowOp, OpTy, {RetTy, RetTy}, FMF, ScalarizationCostPassed);
      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
                                              CondTy, nullptr);
      return Cost;
    }
    case Intrinsic::smul_fix:
    case Intrinsic::umul_fix: {
      unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
      Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);

      unsigned ExtOp =
          IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;

      unsigned Cost = 0;
      Cost += 2 * ConcreteTTI->getCastInstrCost(ExtOp, ExtTy, RetTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Mul, ExtTy);
      Cost +=
          2 * ConcreteTTI->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::LShr, RetTy,
                                                  TTI::OK_AnyValue,
                                                  TTI::OK_UniformConstantValue);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Shl, RetTy,
                                                  TTI::OK_AnyValue,
                                                  TTI::OK_UniformConstantValue);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Or, RetTy);
      return Cost;
    }
    case Intrinsic::sadd_with_overflow:
    case Intrinsic::ssub_with_overflow: {
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned Opcode = IID == Intrinsic::sadd_with_overflow
                            ? BinaryOperator::Add
                            : BinaryOperator::Sub;

      // LHSSign -> LHS >= 0
      // RHSSign -> RHS >= 0
      // SumSign -> Sum >= 0
      //
      // Add:
      // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
      // Sub:
      // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
      unsigned Cost = 0;
      Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy);
      Cost += 3 * ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
                                                  OverflowTy, nullptr);
      Cost += 2 * ConcreteTTI->getCmpSelInstrCost(
                      BinaryOperator::ICmp, OverflowTy, OverflowTy, nullptr);
      Cost +=
          ConcreteTTI->getArithmeticInstrCost(BinaryOperator::And, OverflowTy);
      return Cost;
    }
    case Intrinsic::uadd_with_overflow:
    case Intrinsic::usub_with_overflow: {
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned Opcode = IID == Intrinsic::uadd_with_overflow
                            ? BinaryOperator::Add
                            : BinaryOperator::Sub;

      unsigned Cost = 0;
      Cost += ConcreteTTI->getArithmeticInstrCost(Opcode, SumTy);
      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
                                              OverflowTy, nullptr);
      return Cost;
    }
    case Intrinsic::smul_with_overflow:
    case Intrinsic::umul_with_overflow: {
      Type *MulTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
      Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);

      unsigned ExtOp = IID == Intrinsic::smul_with_overflow
                           ? Instruction::SExt
                           : Instruction::ZExt;

      unsigned Cost = 0;
      Cost += 2 * ConcreteTTI->getCastInstrCost(ExtOp, ExtTy, MulTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::Mul, ExtTy);
      Cost +=
          2 * ConcreteTTI->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy);
      Cost += ConcreteTTI->getArithmeticInstrCost(Instruction::LShr, MulTy,
                                                  TTI::OK_AnyValue,
                                                  TTI::OK_UniformConstantValue);

      if (IID == Intrinsic::smul_with_overflow)
        Cost += ConcreteTTI->getArithmeticInstrCost(
            Instruction::AShr, MulTy, TTI::OK_AnyValue,
            TTI::OK_UniformConstantValue);

      Cost += ConcreteTTI->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy,
                                              OverflowTy, nullptr);
      return Cost;
    }
    case Intrinsic::ctpop:
      ISDs.push_back(ISD::CTPOP);
      // In case of legalization use TCC_Expensive. This is cheaper than a
      // library call but still not a cheap instruction.
      SingleCallCost = TargetTransformInfo::TCC_Expensive;
      break;
    // FIXME: ctlz, cttz, ...
    }

    const TargetLoweringBase *TLI = getTLI();
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

    SmallVector<unsigned, 2> LegalCost;
    SmallVector<unsigned, 2> CustomCost;
    for (unsigned ISD : ISDs) {
      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
        if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
            TLI->isFAbsFree(LT.second)) {
          return 0;
        }

        // The operation is legal. Assume it costs 1.
        // If the type is split to multiple registers, assume that there is some
        // overhead to this.
        // TODO: Once we have extract/insert subvector cost we need to use them.
    const TargetLoweringBase *TLI = getTLI();
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

    SmallVector<unsigned, 2> LegalCost;
    SmallVector<unsigned, 2> CustomCost;
    for (unsigned ISD : ISDs) {
      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
        if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
            TLI->isFAbsFree(LT.second)) {
          return 0;
        }

        // The operation is legal. Assume it costs 1.
        // If the type is split to multiple registers, assume that there is some
        // overhead to this.
        // TODO: Once we have extract/insert subvector cost we need to use them.
        if (LT.first > 1)
          LegalCost.push_back(LT.first * 2);
        else
          LegalCost.push_back(LT.first * 1);
      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
        // If the operation is custom lowered then assume
        // that the code is twice as expensive.
        CustomCost.push_back(LT.first * 2);
      }
    }

    auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
    if (MinLegalCostI != LegalCost.end())
      return *MinLegalCostI;

    auto MinCustomCostI =
        std::min_element(CustomCost.begin(), CustomCost.end());
    if (MinCustomCostI != CustomCost.end())
      return *MinCustomCostI;

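    // Neither a legal nor a custom lowering was found for any of the mapped
    // ISD nodes, so fall back to generic estimates. As an illustration of the
    // scalarization path below, a <4 x float> intrinsic with no vector
    // lowering is costed as four scalar calls plus the insert/extract
    // overhead of scalarizing the result and any vector operands.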
    // If we can't lower fmuladd into an FMA, estimate the cost as a floating
    // point mul followed by an add.
    if (IID == Intrinsic::fmuladd)
      return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
             ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);

    // Else, assume that we need to scalarize this intrinsic. For math builtins
    // this will emit a costly libcall, adding call overhead and spills. Make it
    // very expensive.
    if (RetTy->isVectorTy()) {
      unsigned ScalarizationCost =
          ((ScalarizationCostPassed != std::numeric_limits<unsigned>::max())
               ? ScalarizationCostPassed
               : getScalarizationOverhead(RetTy, true, false));
      unsigned ScalarCalls = RetTy->getVectorNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy())
          Ty = Ty->getScalarType();
        ScalarTys.push_back(Ty);
      }
      unsigned ScalarCost = ConcreteTTI->getIntrinsicInstrCost(
          IID, RetTy->getScalarType(), ScalarTys, FMF);
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        if (Tys[i]->isVectorTy()) {
          if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
            ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
          ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
        }
      }

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }

    // This is going to be turned into a library call, make it expensive.
    return SingleCallCost;
  }

  /// Compute the cost of the given call instruction.
  ///
  /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr; in this case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate a call to a vector
  /// counterpart of the given function.
  /// \param F Called function, might be nullptr.
  /// \param RetTy Return value type.
  /// \param Tys Argument types.
  /// \returns The cost of the Call instruction.
  unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
    return 10;
  }

  unsigned getNumberOfParts(Type *Tp) {
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
    return LT.first;
  }

  unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                     const SCEV *) {
    return 0;
  }

  /// Try to calculate arithmetic and shuffle op costs for reduction operations.
  /// We're assuming that reduction operations are performed the following way:
  /// 1. Non-pairwise reduction
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n - 1, i32 undef, ..., i32 undef>
  /// \----------------v-------------/  \----------v------------/
  ///                  n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val, <n x t> %val1
  /// After this operation we have a vector %red1 where only the first n/2
  /// elements are meaningful, the second n/2 elements are undefined and can be
  /// dropped. All other operations are actually working with the vector of
  /// length n/2, not n, though the real vector length is still n.
  /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2 - 1, i32 undef, ..., i32 undef>
  /// \----------------v-------------/  \----------v------------/
  ///                  n/4 elements              3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
  /// length n/2, the resulting vector has length n/4 etc.
  /// 2. Pairwise reduction:
  /// Everything is the same except for an additional shuffle operation which
  /// is used to produce operands for pairwise kind of reductions.
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
  /// \-------------v----------/  \----------v------------/
  ///           n/2 elements               n/2 elements
  /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
  /// \-------------v----------/  \----------v------------/
  ///           n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val1, <n x t> %val2
  /// Again, the operation is performed on <n x t> vector, but the resulting
  /// vector %red1 is <n/2 x t> vector.
  ///
  /// The cost model should take into account that the actual length of the
  /// vector is reduced on each iteration.
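  ///
  /// As a worked illustration only (assuming a target whose widest legal
  /// vector type holds 4 elements): reducing <8 x float> is costed as one
  /// extract-subvector shuffle plus one <4 x float> op to split down to the
  /// legal width, then two in-register levels, each a single-source permute
  /// plus an op, and finally one extractelement of lane 0. Pairwise
  /// reductions add one extra shuffle per level except the last.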
  unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                      bool IsPairwise) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned ArithCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, SubTy);
      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, SubTy);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.

    // Non-pairwise reductions need one shuffle per reduction level. Pairwise
    // reductions need two shuffles on every level but the last one. On that
    // level one of the shuffles is <0, u, u, ...> which is identity.
    unsigned NumShuffles = NumReduxLevels;
    if (IsPairwise && NumReduxLevels >= 1)
      NumShuffles += NumReduxLevels - 1;
    ShuffleCost += NumShuffles *
                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                               0, Ty);
    ArithCost += NumReduxLevels *
                 ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
    return ShuffleCost + ArithCost +
           ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  }

  /// Try to calculate op costs for min/max reduction operations.
  /// \param CondTy Conditional type for the Select instruction.
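  /// The cost model mirrors getArithmeticReductionCost, except that each
  /// reduction level is costed as a compare plus a select rather than a
  /// single arithmetic op.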
  unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise,
                                  bool) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    Type *ScalarCondTy = CondTy->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned CmpOpcode;
    if (Ty->isFPOrFPVectorTy()) {
      CmpOpcode = Instruction::FCmp;
    } else {
      assert(Ty->isIntOrIntVectorTy() &&
             "expecting floating point or integer type for min/max reduction");
      CmpOpcode = Instruction::ICmp;
    }
    unsigned MinMaxCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
      CondTy = VectorType::get(ScalarCondTy, NumVecElts);

      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, SubTy);
      MinMaxCost +=
          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, nullptr) +
          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
                                          nullptr);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.

    // Non-pairwise reductions need one shuffle per reduction level. Pairwise
    // reductions need two shuffles on every level but the last one. On that
    // level one of the shuffles is <0, u, u, ...> which is identity.
    unsigned NumShuffles = NumReduxLevels;
    if (IsPairwise && NumReduxLevels >= 1)
      NumShuffles += NumReduxLevels - 1;
    ShuffleCost += NumShuffles *
                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                               0, Ty);
    MinMaxCost +=
        NumReduxLevels *
        (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
         ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                         nullptr));
    // The last min/max should be in vector registers and we counted it above.
    // So just need a single extractelement.
    return ShuffleCost + MinMaxCost +
           ConcreteTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  }

  unsigned getVectorSplitCost() { return 1; }

  /// @}
};

/// Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  using BaseT = BasicTTIImplBase<BasicTTIImpl>;

  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};

} // end namespace llvm

#endif // LLVM_CODEGEN_BASICTTIIMPL_H