//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target independent and default TTI implementations handle the
/// rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

namespace llvm {

class AMDGPUTargetLowering;
class InstCombiner;
class Loop;
class ScalarEvolution;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const GCNSubtarget *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        TargetTriple(TM->getTargetTriple()),
        ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()) {}

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);
};

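/// Cost model and TTI hooks for the GCN (amdgcn) subtargets. Passes normally
/// reach this implementation through the generic TargetTransformInfo wrapper
/// created by the target machine, rather than by constructing it directly.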
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  unsigned MaxVGPRs;

  const FeatureBitset InlineFeatureIgnoreList = {
      // Codegen control options which don't matter.
      AMDGPU::FeatureEnableLoadStoreOpt,
      AMDGPU::FeatureEnableSIScheduler,
      AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
      AMDGPU::FeatureFlatForGlobal,
      AMDGPU::FeaturePromoteAlloca,
      AMDGPU::FeatureUnalignedScratchAccess,
      AMDGPU::FeatureUnalignedAccessMode,

      AMDGPU::FeatureAutoWaitcntBeforeBarrier,

      // Property of the kernel/environment which can't actually differ.
      AMDGPU::FeatureSGPRInitBug,
      AMDGPU::FeatureXNACK,
      AMDGPU::FeatureTrapHandler,

      // The default assumption needs to be that ECC is enabled, but no
      // directly exposed operations depend on it, so it can be safely inlined.
      AMDGPU::FeatureSRAMECC,

      // Perf-tuning features
      AMDGPU::FeatureFastFMAF32,
      AMDGPU::HalfRate64Ops
  };

  const GCNSubtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and on others quarter
  // rate. This also applies to some integer operations.
  inline int get64BitInstrCost(
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const {
    return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                  : getQuarterRateInstrCost(CostKind);
  }

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()), CommonTTI(TM, F),
        IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
        MaxVGPRs(ST->getMaxNumVGPRs(
            std::max(ST->getWavesPerEU(F).first,
                     ST->getWavesPerEUForWorkGroup(
                         ST->getFlatWorkGroupSizes(F).second)))) {
    AMDGPU::SIModeRegisterDefaults Mode(F);
    HasFP32Denormals = Mode.allFP32Denormals();
    HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
  }

  bool hasBranchDivergence() { return true; }
  bool useGPUDivergenceAnalysis() const;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getHardwareNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(unsigned RCID) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

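  // Load/store vectorizer hooks: whether a chain totaling ChainSizeInBytes
  // with the given alignment may be vectorized into a single access in the
  // given address space.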
  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;
  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                  unsigned SrcAddrSpace, unsigned DestAddrSpace,
                                  unsigned SrcAlign, unsigned DestAlign) const;

  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
                                         LLVMContext &Context,
                                         unsigned RemainingBytes,
                                         unsigned SrcAddrSpace,
                                         unsigned DestAddrSpace,
                                         unsigned SrcAlign,
                                         unsigned DestAlign) const;
  unsigned getMaxInterleaveFactor(unsigned VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  int getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr);

  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

  unsigned getFlatAddressSpace() const {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;
  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  bool canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                 InstCombiner &IC) const;
  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) const;
  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;

  unsigned getVectorSplitCost() { return 0; }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                          VectorType *SubTp);

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  unsigned getInliningThresholdMultiplier() { return 11; }

  int getInlinerVectorBonusPercent() { return 0; }

  int getArithmeticReductionCost(
      unsigned Opcode,
      VectorType *Ty,
      bool IsPairwise,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);

  int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                            TTI::TargetCostKind CostKind);
  int getMinMaxReductionCost(
      VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
};

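/// Cost model and TTI hooks for the older, pre-GCN r600 targets; only the
/// reduced set of hooks below is overridden here.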
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
  using BaseT = BasicTTIImplBase<R600TTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const R600Subtarget *ST;
  const AMDGPUTargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;

public:
  explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}

  const R600Subtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);
  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);
  unsigned getHardwareNumberOfRegisters(bool Vec) const;
  unsigned getNumberOfRegisters(bool Vec) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;
  unsigned getMaxInterleaveFactor(unsigned VF);
  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H