//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);

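// Return true if \p Cond depends (transitively, up to a small depth) on a PHI
// node that is defined in loop \p L itself rather than in one of its subloops.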
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
             return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
           return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not based on a variable; we will most likely be unable to combine
        // it. Do not unroll too-deep inner loops for local memory, to give an
        // outer loop a chance to be unrolled for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
               return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher-than-normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return 256;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

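// Clamp the load vectorization factor so that chains of sub-32-bit elements
// are not vectorized wider than 128 bits.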
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element sizes of less than 32 bits?
    return 128 / LoadSize;

  return VF;
}

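// Clamp the store vectorization factor so that no store chain is vectorized
// wider than 128 bits.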
unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
      AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 128;

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  llvm_unreachable("unhandled address space");
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            unsigned Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

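// Describe the memory behavior of the target's DS/atomic intrinsics so that
// generic analyses can treat them like ordinary memory accesses.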
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                       TTI::OperandValueKind Opd1Info,
                                       TTI::OperandValueKind Opd2Info,
                                       TTI::OperandValueProperties Opd1PropInfo,
                                       TTI::OperandValueProperties Opd2PropInfo,
                                       ArrayRef<const Value *> Args,
                                       const Instruction *CxtI) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only legal vector
  // types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

template <typename T>
int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                      ArrayRef<T *> Args,
                                      FastMathFlags FMF, unsigned VF) {
  if (ID != Intrinsic::fma)
    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);

  EVT OrigTy = TLI->getValueType(DL, RetTy);
  if (!OrigTy.isSimple()) {
    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  if (SLT == MVT::f64)
    return LT.first * NElts * get64BitInstrCost();

  if (ST->has16BitInsts() && SLT == MVT::f16)
    NElts = (NElts + 1) / 2;

  return LT.first * NElts * (ST->hasFastFMAF32() ? getHalfRateInstrCost()
                                                 : getQuarterRateInstrCost());
}

int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                      ArrayRef<Value*> Args, FastMathFlags FMF,
                                      unsigned VF) {
  return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF);
}

int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
                                      unsigned ScalarizationCostPassed) {
  return getIntrinsicInstrCost<Type>(ID, RetTy, Tys, FMF,
                                     ScalarizationCostPassed);
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                           bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                       bool IsPairwise,
                                       bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes the cost on targets that have packed math instructions (which
  // support 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

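// Return true if the given argument is known to be passed in an SGPR, and is
// therefore uniform across the wavefront.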
static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
    // Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}

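// Lane reads and wave-wide compare intrinsics produce the same value in every
// lane, so their results are uniform even when their operands are divergent.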
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
      return true;
    }
  }
  return false;
}

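// Operand 0 of these intrinsics is a pointer that may be rewritten from a
// flat address space to a more specific one.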
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

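// Rewrite the intrinsic to operate on the non-flat pointer \p NewV: the DS
// atomics are re-declared with the new pointer type, while is.shared and
// is.private fold to a constant once the address space is known.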
bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
  IntrinsicInst *II, Value *OldV, Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return false;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return true;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    II->replaceAllUsesWith(NewVal);
    II->eraseFromParent();
    return true;
  }
  default:
    return false;
  }
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low or high half
      // of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

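// Inlining is compatible only if the callee's subtarget features (ignoring
// those on the ignore list) are a subset of the caller's, and the FP mode
// register defaults of both functions agree.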
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const GCNSubtarget *CallerST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
  const GCNSubtarget *CalleeST
    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));

  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
  return CallerMode.isInlineCompatible(CalleeMode);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

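// Dispatch generic user-cost queries to the specialized cost hooks above so
// that callers see target-accurate costs rather than the generic defaults.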
unsigned GCNTTIImpl::getUserCost(const User *U,
                                 ArrayRef<const Value *> Operands) {
  const Instruction *I = dyn_cast<Instruction>(U);
  if (!I)
    return BaseT::getUserCost(U, Operands);

  // Estimate the cost of different operations, some of which may be optimized
  // out entirely.
  switch (I->getOpcode()) {
  case Instruction::ExtractElement: {
    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
    unsigned Idx = -1;
    if (CI)
      Idx = CI->getZExtValue();
    return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx);
  }
  case Instruction::InsertElement: {
    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
    unsigned Idx = -1;
    if (CI)
      Idx = CI->getZExtValue();
    return getVectorInstrCost(I->getOpcode(), I->getType(), Idx);
  }
  case Instruction::Call: {
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
      SmallVector<Value *, 4> Args(II->arg_operands());
      FastMathFlags FMF;
      if (auto *FPMO = dyn_cast<FPMathOperator>(II))
        FMF = FPMO->getFastMathFlags();
      return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
                                   FMF);
    } else {
      return BaseT::getUserCost(U, Operands);
    }
  }
  case Instruction::ShuffleVector: {
    const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
    Type *Ty = Shuffle->getType();
    Type *SrcTy = Shuffle->getOperand(0)->getType();

    // TODO: Identify and add costs for insert subvector, etc.
    int SubIndex;
    if (Shuffle->isExtractSubvectorMask(SubIndex))
      return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty);

    if (Shuffle->changesLength())
      return BaseT::getUserCost(U, Operands);

    if (Shuffle->isIdentity())
      return 0;

    if (Shuffle->isReverse())
      return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr);

    if (Shuffle->isSelect())
      return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr);

    if (Shuffle->isTranspose())
      return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr);

    if (Shuffle->isZeroEltSplat())
      return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr);

    if (Shuffle->isSingleSource())
      return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr);

    return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast: {
    return getCastInstrCost(I->getOpcode(), I->getType(),
                            I->getOperand(0)->getType(), I);
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::FNeg: {
    return getArithmeticInstrCost(I->getOpcode(), I->getType(),
                                  TTI::OK_AnyValue, TTI::OK_AnyValue,
                                  TTI::OP_None, TTI::OP_None, Operands, I);
  }
  default:
    break;
  }

  return BaseT::getUserCost(U, Operands);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
        AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}