//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target independent and default TTI implementations handle the
/// rest.
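///
/// A minimal sketch (assuming the usual LLVMTargetMachine hook; see
/// AMDGPUTargetMachine.cpp rather than this header for the real wiring) of how
/// one of these implementations is handed to the optimizer:
///
/// \code
///   TargetTransformInfo
///   GCNTargetMachine::getTargetTransformInfo(const Function &F) {
///     // Wrap the GCN-specific TTI implementation built for this function.
///     return TargetTransformInfo(GCNTTIImpl(this, F));
///   }
/// \endcode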
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

namespace llvm {

class AMDGPUTargetLowering;
class InstCombiner;
class Loop;
class ScalarEvolution;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const GCNSubtarget *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        TargetTriple(TM->getTargetTriple()),
        ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()) {}

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
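  // Maximum number of VGPRs a wavefront can use while still reaching the
  // occupancy (waves per EU) implied by the function's attributes; set in the
  // constructor below.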
  unsigned MaxVGPRs;

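  // Subtarget features that are ignored when deciding whether a callee with a
  // different feature set may be inlined into a caller (see
  // areInlineCompatible below).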
  const FeatureBitset InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt,
    AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
    AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca,
    AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug,
    AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be that ECC is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32,
    AMDGPU::HalfRate64Ops
  };

  const GCNSubtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

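  // Relative instruction costs, expressed as multiples of TCC_Basic (1):
  // full-rate operations cost 1, half-rate 2 and quarter-rate 4 in terms of
  // reciprocal throughput, while the code-size cost for the slower cases is a
  // fixed 2.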
  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
  inline int get64BitInstrCost(
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const {
    return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                  : getQuarterRateInstrCost(CostKind);
  }

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()),
        ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
        TLI(ST->getTargetLowering()), CommonTTI(TM, F),
        IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
        MaxVGPRs(ST->getMaxNumVGPRs(
            std::max(ST->getWavesPerEU(F).first,
                     ST->getWavesPerEUForWorkGroup(
                         ST->getFlatWorkGroupSizes(F).second)))) {
    AMDGPU::SIModeRegisterDefaults Mode(F);
    HasFP32Denormals = Mode.allFP32Denormals();
    HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
  }

  bool hasBranchDivergence() { return true; }
  bool useGPUDivergenceAnalysis() const;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getHardwareNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(unsigned RCID) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;
  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                  unsigned SrcAddrSpace, unsigned DestAddrSpace,
                                  unsigned SrcAlign, unsigned DestAlign) const;

  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
                                         LLVMContext &Context,
                                         unsigned RemainingBytes,
                                         unsigned SrcAddrSpace,
                                         unsigned DestAddrSpace,
                                         unsigned SrcAlign,
                                         unsigned DestAlign) const;
  unsigned getMaxInterleaveFactor(unsigned VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  int getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr);

  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

  unsigned getFlatAddressSpace() const {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
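    // Returning -1 (i.e. "no flat address space") makes InferAddressSpaces
    // skip these functions entirely.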
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;
  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  bool canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                 InstCombiner &IC) const;
  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) const;
  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;

  unsigned getVectorSplitCost() { return 0; }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                          VectorType *SubTp);

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  unsigned getInliningThresholdMultiplier() { return 11; }

  int getInlinerVectorBonusPercent() { return 0; }

  int getArithmeticReductionCost(
      unsigned Opcode,
      VectorType *Ty,
      bool IsPairwise,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);

  int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                            TTI::TargetCostKind CostKind);
  int getMinMaxReductionCost(
    VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
};

class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
  using BaseT = BasicTTIImplBase<R600TTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const R600Subtarget *ST;
  const AMDGPUTargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;

public:
  explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()),
      CommonTTI(TM, F) {}

  const R600Subtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);
  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);
  unsigned getHardwareNumberOfRegisters(bool Vec) const;
  unsigned getNumberOfRegisters(bool Vec) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;
  unsigned getMaxInterleaveFactor(unsigned VF);
  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H