//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"

namespace llvm {

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// \brief Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target-independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  typedef TargetTransformInfoImplCRTPBase<T> BaseT;
  typedef TargetTransformInfo TTI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted into and/or extracted from
  /// vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
    assert(Ty->isVectorTy() && "Can only scalarize vectors");
    unsigned Cost = 0;

    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      if (Insert)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Estimate the cost overhead of SK_Alternate shuffle.
  unsigned getAltShuffleOverhead(Type *Ty) {
    assert(Ty->isVectorTy() && "Can only shuffle vectors");
    unsigned Cost = 0;
    // The shuffle cost is equal to the cost of extracting each element from
    // its source vector plus the cost of inserting it into the result vector.
    //
    // E.g. a <4 x float> alternate shuffle has a mask of <0,5,2,7>, i.e. we
    // need to extract index 0 of the first vector, index 1 of the second
    // vector, index 2 of the first vector and finally index 3 of the second
    // vector, and insert them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }
    return Cost;
  }
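  // Worked example (illustrative, assuming each element insert/extract has
  // unit cost): for the <4 x float> alternate shuffle above, the loop counts
  // 4 extracts plus 4 inserts, so the reported overhead is 8. Targets whose
  // getVectorInstrCost() returns cheaper element accesses will report a
  // correspondingly smaller overhead.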
  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}

  using TargetTransformInfoImplBase::DL;

public:
  // Provide value semantics. MSVC requires that we spell all of these out.
  BasicTTIImplBase(const BasicTTIImplBase &Arg)
      : BaseT(static_cast<const BaseT &>(Arg)) {}
  BasicTTIImplBase(BasicTTIImplBase &&Arg)
      : BaseT(std::move(static_cast<BaseT &>(Arg))) {}

  /// \name Scalar TTI Implementations
  /// @{

  bool hasBranchDivergence() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace);
  }
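  // Illustrative query (hypothetical values): asking whether the address
  // "[BaseReg + 4 * IndexReg + 16]" is legal for accesses of type Ty would
  // correspond to isLegalAddressingMode(Ty, /*BaseGV=*/nullptr,
  // /*BaseOffset=*/16, /*HasBaseReg=*/true, /*Scale=*/4, AddrSpace).
  // Whether such a mode is actually legal is entirely target-specific.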
  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<const Value *> Arguments) {
    return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<Type *> ParamTys) {
    if (IID == Intrinsic::cttz) {
      if (getTLI()->isCheapToSpeculateCttz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    if (IID == Intrinsic::ctlz) {
      if (getTLI()->isCheapToSpeculateCtlz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
  }

  unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }

  unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  unsigned getFPOpCost(Type *Ty) {
    // By default, assume FP instructions are no more expensive than integer
    // instructions, since they are implemented in hardware. Target-specific
    // TTI can override this.
    return TargetTransformInfo::TCC_Basic;
  }

  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
    const TargetLoweringBase *TLI = getTLI();
    switch (Opcode) {
    default: break;
    case Instruction::Trunc: {
      if (TLI->isTruncateFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    }
    case Instruction::ZExt: {
      if (TLI->isZExtFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    }
    }

    return BaseT::getOperationCost(Opcode, Ty, OpTy);
  }
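  // For example, a trunc from i64 to i32 is reported as TCC_Free on targets
  // whose TargetLowering::isTruncateFree() returns true for that pair, and as
  // TCC_Basic otherwise; any opcode not handled above falls back to the
  // generic cost from the base class.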
  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches.
    //  - The loop must have less than 40 uops in all executed loop branches.

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
         ++I) {
      BasicBlock *BB = *I;

      for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
        if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
          ImmutableCallSite CS(&*J);
          if (const Function *F = CS.getCalledFunction()) {
            if (!static_cast<T *>(this)->isLoweredToCall(F))
              continue;
          }

          return;
        }
    }

    // Enable runtime and partial unrolling up to the specified size.
    UP.Partial = UP.Runtime = true;
    UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }

  unsigned getRegisterBitWidth(bool Vector) { return 32; }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->getScalarType()->isFloatingPointTy();
    // Assume that floating-point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered then assume
      // that the code is twice as expensive.
      return LT.first * 2 * OpCost;
    }

    // Else, assume that we need to scalarize this op.
    if (Ty->isVectorTy()) {
      unsigned Num = Ty->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)
                          ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Ty, true, true) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }
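  // Worked example (illustrative, target-dependent): an integer add on
  // <8 x i32>, on a target whose widest legal vector register is 128 bits and
  // where the <4 x i32> add is legal, legalizes into two <4 x i32> operations,
  // so getTypeLegalizationCost() returns LT.first == 2 and the reported cost
  // is 2 * OpCost == 2. If the operation instead has to be expanded, the cost
  // becomes the insert/extract scalarization overhead plus eight scalar ops.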
  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                          Type *SubTp) {
    if (Kind == TTI::SK_Alternate) {
      return getAltShuffleOverhead(Tp);
    }
    return 1;
  }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);

    // Check for NOOP conversions.
    if (SrcLT.first == DstLT.first &&
        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

      // Bitcasts between types that are legalized to the same type are free.
      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
        return 0;
    }

    if (Opcode == Instruction::Trunc &&
        TLI->isTruncateFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::ZExt &&
        TLI->isZExtFree(SrcLT.second, DstLT.second))
      return 0;

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return 1;

    // Handle scalar conversions.
    if (!Src->isVectorTy() && !Dst->isVectorTy()) {

      // Scalar bitcasts are usually free.
      if (Opcode == Instruction::BitCast)
        return 0;

      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (Dst->isVectorTy() && Src->isVectorTy()) {

      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first &&
          SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return 1;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are converting vectors and the operation is illegal, or
      // if the vectors are legalized to different types, estimate the
      // scalarization costs.
      unsigned Num = Dst->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType());

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcasts between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast)
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
                                : 0) +
             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
                                : 0);

    llvm_unreachable("Unhandled cast");
  }
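  // Illustrative case (assuming a target with 128-bit vector registers): a
  // bitcast from <4 x i32> to <2 x i64> legalizes both sides to a single
  // 128-bit register, so the NOOP check above reports it as free (cost 0).
  // A sign extension between vectors that legalize to same-sized registers
  // is costed as a SHL+SRA pair (2), while the corresponding zero extension
  // is costed as a single AND (1).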
  unsigned getCFInstrCost(unsigned Opcode) {
    // Branches are assumed to be predicted.
    return 0;
  }

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the operation is scalarized.
    if (ValTy->isVectorTy()) {
      unsigned Num = ValTy->getVectorNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
          Opcode, ValTy->getScalarType(), CondTy);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    std::pair<unsigned, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) {
    assert(!Src->isVoidTy() && "Invalid type");
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

    // Assume that all loads of legal types cost 1.
    unsigned Cost = LT.first;

    if (Src->isVectorTy() &&
        Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store
      // is legal, this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }
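  // Illustrative case (assuming unit per-element costs): a load of <4 x i8>
  // on a target that widens that type to a 128-bit register has
  // Src->getPrimitiveSizeInBits() == 32 < 128, so unless the matching
  // extending load is Legal or Custom the returned cost also includes
  // inserting the four loaded elements to rebuild the vector.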
  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment,
                                      unsigned AddressSpace) {
    VectorType *VT = dyn_cast<VectorType>(VecTy);
    assert(VT && "Expect a vector type for interleaved memory op");

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation itself.
    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
        Opcode, VecTy, Alignment, AddressSpace);

    // Then add the cost of the interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub vectors'
      // elements from the wide vector and inserting them into the sub
      // vectors.
      //
      // E.g. an interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extracting the elements at indices 0, 2, 4,
      // 6 from the <8 x i32> vector and inserting them into a <4 x i32>
      // vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");

      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from the loaded vector for each sub vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += static_cast<T *>(this)->getVectorInstrCost(
              Instruction::ExtractElement, VT, Index + i * Factor);
      }

      unsigned InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is extracting all elements from the sub vectors
      // and inserting them into the wide vector.
      //
      // E.g. an interleaved store of factor 2:
      //      %interleaved.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extracting all elements from both <4 x i32>
      // vectors and inserting them into the <8 x i32> vector.

      unsigned ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::ExtractElement, SubVT, i);
      Cost += ExtSubCost * Factor;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    return Cost;
  }
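  // Worked example (assuming unit per-element costs and a unit wide-vector
  // load): a factor-2 interleaved load of <8 x i32> using both indices has
  // NumSubElts == 4, so the load path above adds 2 * 4 extracts from the
  // wide vector and 2 * 4 inserts into the <4 x i32> sub vectors, for a
  // total estimate of 1 + 8 + 8 = 17.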
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Type *> Tys) {
    unsigned ISD = 0;
    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      unsigned ScalarizationCost = 0;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (RetTy->isVectorTy()) {
        ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy()) {
          ScalarizationCost += getScalarizationOverhead(Ty, false, true);
          ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
          Ty = Ty->getScalarType();
        }
        ScalarTys.push_back(Ty);
      }
      if (ScalarCalls == 1)
        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, ScalarRetTy, ScalarTys);

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }
    // Look for intrinsics that can be lowered directly or turned into a
    // scalar intrinsic call.
    case Intrinsic::sqrt:
      ISD = ISD::FSQRT;
      break;
    case Intrinsic::sin:
      ISD = ISD::FSIN;
      break;
    case Intrinsic::cos:
      ISD = ISD::FCOS;
      break;
    case Intrinsic::exp:
      ISD = ISD::FEXP;
      break;
    case Intrinsic::exp2:
      ISD = ISD::FEXP2;
      break;
    case Intrinsic::log:
      ISD = ISD::FLOG;
      break;
    case Intrinsic::log10:
      ISD = ISD::FLOG10;
      break;
    case Intrinsic::log2:
      ISD = ISD::FLOG2;
      break;
    case Intrinsic::fabs:
      ISD = ISD::FABS;
      break;
    case Intrinsic::minnum:
      ISD = ISD::FMINNUM;
      break;
    case Intrinsic::maxnum:
      ISD = ISD::FMAXNUM;
      break;
    case Intrinsic::copysign:
      ISD = ISD::FCOPYSIGN;
      break;
    case Intrinsic::floor:
      ISD = ISD::FFLOOR;
      break;
    case Intrinsic::ceil:
      ISD = ISD::FCEIL;
      break;
    case Intrinsic::trunc:
      ISD = ISD::FTRUNC;
      break;
    case Intrinsic::nearbyint:
      ISD = ISD::FNEARBYINT;
      break;
    case Intrinsic::rint:
      ISD = ISD::FRINT;
      break;
    case Intrinsic::round:
      ISD = ISD::FROUND;
      break;
    case Intrinsic::pow:
      ISD = ISD::FPOW;
      break;
    case Intrinsic::fma:
      ISD = ISD::FMA;
      break;
    case Intrinsic::fmuladd:
      ISD = ISD::FMA;
      break;
    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      return 0;
    case Intrinsic::masked_store:
      return static_cast<T *>(this)
          ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
    case Intrinsic::masked_load:
      return static_cast<T *>(this)
          ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
    }

    const TargetLoweringBase *TLI = getTLI();
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // If the type is split to multiple registers, assume that there is some
      // overhead to this.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      if (LT.first > 1)
        return LT.first * 2;
      return LT.first * 1;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered then assume
      // that the code is twice as expensive.
      return LT.first * 2;
    }

    // If we can't lower fmuladd into an FMA, estimate the cost as a floating
    // point mul followed by an add.
    if (IID == Intrinsic::fmuladd)
      return static_cast<T *>(this)
                 ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
             static_cast<T *>(this)
                 ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);

    // Else, assume that we need to scalarize this intrinsic. For math builtins
    // this will emit a costly libcall, adding call overhead and spills. Make
    // it very expensive.
    if (RetTy->isVectorTy()) {
      unsigned ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
      unsigned ScalarCalls = RetTy->getVectorNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy())
          Ty = Ty->getScalarType();
        ScalarTys.push_back(Ty);
      }
      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, RetTy->getScalarType(), ScalarTys);
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        if (Tys[i]->isVectorTy()) {
          ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
          ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
        }
      }

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }

    // This is going to be turned into a library call; make it expensive.
    return 10;
  }
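  // Illustrative case: for llvm.sqrt.v4f32, ISD is ISD::FSQRT. If FSQRT is
  // legal (or promoted) for the legalized type the cost is LT.first (doubled
  // when the vector is split across registers); if it is custom lowered the
  // cost is 2 * LT.first; otherwise the call is scalarized into four scalar
  // sqrt calls (10 each if they in turn become library calls) plus the
  // per-element insert/extract overhead.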
  /// \brief Compute a cost of the given call instruction.
  ///
  /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr; in this case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate a call of a vector
  /// counterpart of the given function.
  /// \param F Called function, might be nullptr.
  /// \param RetTy Return value types.
  /// \param Tys Argument types.
  /// \returns The cost of the call instruction.
  unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
    return 10;
  }

  unsigned getNumberOfParts(Type *Tp) {
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
    return LT.first;
  }

  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; }

  unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned ArithCost =
        NumReduxLevels *
        static_cast<T *>(this)->getArithmeticInstrCost(Opcode, Ty);
    // Assume the pairwise shuffles add a cost.
    unsigned ShuffleCost =
        NumReduxLevels * (IsPairwise + 1) *
        static_cast<T *>(this)
            ->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
  }
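  // Worked example (illustrative): reducing an <8 x float> with fadd uses
  // Log2_32(8) == 3 reduction levels, so the estimate is 3 vector adds plus
  // 3 shuffles (6 for a pairwise reduction) plus the element-extraction
  // overhead returned by getScalarizationOverhead().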
  /// @}
};

/// \brief Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  typedef BasicTTIImplBase<BasicTTIImpl> BaseT;
  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);

  // Provide value semantics. MSVC requires that we spell all of these out.
  BasicTTIImpl(const BasicTTIImpl &Arg)
      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
  BasicTTIImpl(BasicTTIImpl &&Arg)
      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
        TLI(std::move(Arg.TLI)) {}
};

} // end namespace llvm

#endif