1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
43 static cl::opt<bool> DisablePowerSched(
44   "amdgpu-disable-power-sched",
45   cl::desc("Disable scheduling to minimize mAI power bursts"),
46   cl::init(false));
47 
48 static cl::opt<bool> EnableVGPRIndexMode(
49   "amdgpu-vgpr-index-mode",
50   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51   cl::init(false));
52 
53 static cl::opt<bool> EnableFlatScratch(
54   "amdgpu-enable-flat-scratch",
55   cl::desc("Use flat scratch instructions"),
56   cl::init(false));
57 
58 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
59                            cl::desc("Enable the use of AA during codegen."),
60                            cl::init(true));
61 
62 GCNSubtarget::~GCNSubtarget() = default;
63 
64 R600Subtarget &
65 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
66                                                StringRef GPU, StringRef FS) {
67   SmallString<256> FullFS("+promote-alloca,");
68   FullFS += FS;
69   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
70 
71   HasMulU24 = getGeneration() >= EVERGREEN;
72   HasMulI24 = hasCaymanISA();
73 
74   return *this;
75 }
76 
77 GCNSubtarget &
78 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
79                                               StringRef GPU, StringRef FS) {
80   // Determine default and user-specified characteristics
81   //
82   // We want to be able to turn these off, but making this a subtarget feature
83   // for SI has the unhelpful behavior that it unsets everything else if you
84   // disable it.
85   //
86   // Similarly, we want enable-prt-strict-null to be on by default and not to
87   // unset everything else if it is disabled.
88 
89   // Assuming ECC is enabled is the conservative default.
90   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
91 
92   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
93     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
94 
95   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
96 
97   // Disable mutually exclusive bits.
98   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
99     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
100       FullFS += "-wavefrontsize16,";
101     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
102       FullFS += "-wavefrontsize32,";
103     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
104       FullFS += "-wavefrontsize64,";
105   }
106 
107   FullFS += FS;
108 
109   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
110 
111   // We don't support FP64 for EG/NI atm.
112   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
113 
114   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
115   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
116   // variants of MUBUF instructions.
117   if (!hasAddr64() && !FS.contains("flat-for-global")) {
118     FlatForGlobal = true;
119   }
120 
121   // Set defaults if needed.
122   if (MaxPrivateElementSize == 0)
123     MaxPrivateElementSize = 4;
124 
125   if (LDSBankCount == 0)
126     LDSBankCount = 32;
127 
128   if (TT.getArch() == Triple::amdgcn) {
129     if (LocalMemorySize == 0)
130       LocalMemorySize = 32768;
131 
132     // Do something sensible for unspecified target.
133     if (!HasMovrel && !HasVGPRIndexMode)
134       HasMovrel = true;
135   }
136 
137   // Don't crash on invalid devices.
138   if (WavefrontSizeLog2 == 0)
139     WavefrontSizeLog2 = 5;
140 
141   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
142 
143   // Disable XNACK on targets that do not support it, unless it is
144   // explicitly requested.
145   if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
146     ToggleFeature(AMDGPU::FeatureXNACK);
147     EnableXNACK = false;
148   }
149 
150   // ECC is on by default, but turn it off if the hardware doesn't support it
151   // anyway. This matters for the gfx9 targets that have d16 loads but don't
152   // support ECC.
153   if (DoesNotSupportSRAMECC && EnableSRAMECC) {
154     ToggleFeature(AMDGPU::FeatureSRAMECC);
155     EnableSRAMECC = false;
156   }
157 
158   return *this;
159 }
160 
161 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
162   TargetTriple(TT),
163   Has16BitInsts(false),
164   HasMadMixInsts(false),
165   HasMadMacF32Insts(false),
166   HasDsSrc2Insts(false),
167   HasSDWA(false),
168   HasVOP3PInsts(false),
169   HasMulI24(true),
170   HasMulU24(true),
171   HasInv2PiInlineImm(false),
172   HasFminFmaxLegacy(true),
173   EnablePromoteAlloca(false),
174   HasTrigReducedRange(false),
175   MaxWavesPerEU(10),
176   LocalMemorySize(0),
177   WavefrontSizeLog2(0)
178   { }
179 
180 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
181                            const GCNTargetMachine &TM) :
182     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
183     AMDGPUSubtarget(TT),
184     TargetTriple(TT),
185     Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
186     InstrItins(getInstrItineraryForCPU(GPU)),
187     LDSBankCount(0),
188     MaxPrivateElementSize(0),
189 
190     FastFMAF32(false),
191     FastDenormalF32(false),
192     HalfRate64Ops(false),
193 
194     FlatForGlobal(false),
195     AutoWaitcntBeforeBarrier(false),
196     UnalignedScratchAccess(false),
197     UnalignedAccessMode(false),
198 
199     HasApertureRegs(false),
200     EnableXNACK(false),
201     DoesNotSupportXNACK(false),
202     EnableCuMode(false),
203     TrapHandler(false),
204 
205     EnableLoadStoreOpt(false),
206     EnableUnsafeDSOffsetFolding(false),
207     EnableSIScheduler(false),
208     EnableDS128(false),
209     EnablePRTStrictNull(false),
210     DumpCode(false),
211 
212     FP64(false),
213     GCN3Encoding(false),
214     CIInsts(false),
215     GFX8Insts(false),
216     GFX9Insts(false),
217     GFX10Insts(false),
218     GFX10_3Insts(false),
219     GFX7GFX8GFX9Insts(false),
220     SGPRInitBug(false),
221     HasSMemRealTime(false),
222     HasIntClamp(false),
223     HasFmaMixInsts(false),
224     HasMovrel(false),
225     HasVGPRIndexMode(false),
226     HasScalarStores(false),
227     HasScalarAtomics(false),
228     HasSDWAOmod(false),
229     HasSDWAScalar(false),
230     HasSDWASdst(false),
231     HasSDWAMac(false),
232     HasSDWAOutModsVOPC(false),
233     HasDPP(false),
234     HasDPP8(false),
235     HasR128A16(false),
236     HasGFX10A16(false),
237     HasG16(false),
238     HasNSAEncoding(false),
239     GFX10_BEncoding(false),
240     HasDLInsts(false),
241     HasDot1Insts(false),
242     HasDot2Insts(false),
243     HasDot3Insts(false),
244     HasDot4Insts(false),
245     HasDot5Insts(false),
246     HasDot6Insts(false),
247     HasMAIInsts(false),
248     HasPkFmacF16Inst(false),
249     HasAtomicFaddInsts(false),
250     EnableSRAMECC(false),
251     DoesNotSupportSRAMECC(false),
252     HasNoSdstCMPX(false),
253     HasVscnt(false),
254     HasGetWaveIdInst(false),
255     HasSMemTimeInst(false),
256     HasRegisterBanking(false),
257     HasVOP3Literal(false),
258     HasNoDataDepHazard(false),
259     FlatAddressSpace(false),
260     FlatInstOffsets(false),
261     FlatGlobalInsts(false),
262     FlatScratchInsts(false),
263     ScalarFlatScratchInsts(false),
264     AddNoCarryInsts(false),
265     HasUnpackedD16VMem(false),
266     LDSMisalignedBug(false),
267     HasMFMAInlineLiteralBug(false),
268     UnalignedBufferAccess(false),
269     UnalignedDSAccess(false),
270 
271     ScalarizeGlobal(false),
272 
273     HasVcmpxPermlaneHazard(false),
274     HasVMEMtoScalarWriteHazard(false),
275     HasSMEMtoVectorWriteHazard(false),
276     HasInstFwdPrefetchBug(false),
277     HasVcmpxExecWARHazard(false),
278     HasLdsBranchVmemWARHazard(false),
279     HasNSAtoVMEMBug(false),
280     HasOffset3fBug(false),
281     HasFlatSegmentOffsetBug(false),
282     HasImageStoreD16Bug(false),
283     HasImageGather4D16Bug(false),
284 
285     FeatureDisable(false),
286     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
287     TLInfo(TM, *this),
288     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
289   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
290   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
291   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
292   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
293   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
294   InstSelector.reset(new AMDGPUInstructionSelector(
295   *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
296 }
297 
298 bool GCNSubtarget::enableFlatScratch() const {
299   return EnableFlatScratch && hasFlatScratchInsts();
300 }
301 
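// The "constant bus" limit bounds how many scalar operands (SGPRs or literal
// constants) a single VALU instruction may read. Targets before GFX10 allow
// one such read per instruction; GFX10 raises the limit to two, except for
// the 64-bit shift opcodes handled below, which remain limited to one.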
302 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
303   if (getGeneration() < GFX10)
304     return 1;
305 
306   switch (Opcode) {
307   case AMDGPU::V_LSHLREV_B64:
308   case AMDGPU::V_LSHLREV_B64_gfx10:
309   case AMDGPU::V_LSHL_B64:
310   case AMDGPU::V_LSHRREV_B64:
311   case AMDGPU::V_LSHRREV_B64_gfx10:
312   case AMDGPU::V_LSHR_B64:
313   case AMDGPU::V_ASHRREV_I64:
314   case AMDGPU::V_ASHRREV_I64_gfx10:
315   case AMDGPU::V_ASHR_I64:
316     return 1;
317   }
318 
319   return 2;
320 }
321 
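// Worked example (hypothetical numbers): with 65536 bytes of LDS,
// getMaxWavesPerEU() == 10, 4 work groups per CU for the function's maximum
// flat work group size, and NWaves == 5, the bound computed below is
// 65536 * 10 / 4 / 5 = 32768 bytes.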
322 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
323   const Function &F) const {
324   if (NWaves == 1)
325     return getLocalMemorySize();
326   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
327   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
328   if (!WorkGroupsPerCu)
329     return 0;
330   unsigned MaxWaves = getMaxWavesPerEU();
331   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
332 }
333 
334 // FIXME: Should return min,max range.
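// Worked example (hypothetical numbers): with 65536 bytes of LDS, a maximum
// work group size of 256, a wave size of 64, MaxWorkGroupsPerCu == 16, and
// Bytes == 9000: NumGroups = 65536 / 9000 = 7, MaxGroupNumWaves =
// (256 + 63) / 64 = 4, so MaxWaves = 7 * 4 = 28, which is then clamped to
// getMaxWavesPerEU() (e.g. 10).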
335 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
336   const Function &F) const {
337   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
338   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
339   if (!MaxWorkGroupsPerCu)
340     return 0;
341 
342   const unsigned WaveSize = getWavefrontSize();
343 
344   // FIXME: Do we need to account for alignment requirement of LDS rounding the
345   // size up?
346   // Compute restriction based on LDS usage
347   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
348 
349   // This can be queried with more LDS than is possible, so just assume the
350   // worst.
351   if (NumGroups == 0)
352     return 1;
353 
354   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
355 
356   // Round to the number of waves.
357   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
358   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
359 
360   // Clamp to the maximum possible number of waves.
361   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
362 
363   // FIXME: Needs to be a multiple of the group size?
364   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
365 
366   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
367          "computed invalid occupancy");
368   return MaxWaves;
369 }
370 
371 unsigned
372 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
373   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
374   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
375 }
376 
377 std::pair<unsigned, unsigned>
378 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
379   switch (CC) {
380   case CallingConv::AMDGPU_VS:
381   case CallingConv::AMDGPU_LS:
382   case CallingConv::AMDGPU_HS:
383   case CallingConv::AMDGPU_ES:
384   case CallingConv::AMDGPU_GS:
385   case CallingConv::AMDGPU_PS:
386     return std::make_pair(1, getWavefrontSize());
387   default:
388     return std::make_pair(1u, getMaxFlatWorkGroupSize());
389   }
390 }
391 
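// The requested bounds come from the "amdgpu-flat-work-group-size" function
// attribute, e.g. in IR (illustrative):
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256" }
// meaning a minimum flat work group size of 64 and a maximum of 256.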
392 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
393   const Function &F) const {
394   // Default minimum/maximum flat work group sizes.
395   std::pair<unsigned, unsigned> Default =
396     getDefaultFlatWorkGroupSize(F.getCallingConv());
397 
398   // Requested minimum/maximum flat work group sizes.
399   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
400     F, "amdgpu-flat-work-group-size", Default);
401 
402   // Make sure requested minimum is less than requested maximum.
403   if (Requested.first > Requested.second)
404     return Default;
405 
406   // Make sure requested values do not violate subtarget's specifications.
407   if (Requested.first < getMinFlatWorkGroupSize())
408     return Default;
409   if (Requested.second > getMaxFlatWorkGroupSize())
410     return Default;
411 
412   return Requested;
413 }
414 
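// The requested bounds come from the "amdgpu-waves-per-eu" function
// attribute, e.g. in IR (illustrative):
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
// requesting between 2 and 4 waves per execution unit. The request is
// ignored (the default is returned) if it conflicts with the subtarget's
// limits or with an explicitly requested flat work group size range.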
415 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
416   const Function &F) const {
417   // Default minimum/maximum number of waves per execution unit.
418   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
419 
420   // Default/requested minimum/maximum flat work group sizes.
421   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
422 
423   // If minimum/maximum flat work group sizes were explicitly requested using
424   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
425   // number of waves per execution unit to values implied by requested
426   // minimum/maximum flat work group sizes.
427   unsigned MinImpliedByFlatWorkGroupSize =
428     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
429   Default.first = MinImpliedByFlatWorkGroupSize;
430   bool RequestedFlatWorkGroupSize =
431       F.hasFnAttribute("amdgpu-flat-work-group-size");
432 
433   // Requested minimum/maximum number of waves per execution unit.
434   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
435     F, "amdgpu-waves-per-eu", Default, true);
436 
437   // Make sure requested minimum is less than requested maximum.
438   if (Requested.second && Requested.first > Requested.second)
439     return Default;
440 
441   // Make sure requested values do not violate subtarget's specifications.
442   if (Requested.first < getMinWavesPerEU() ||
443       Requested.second > getMaxWavesPerEU())
444     return Default;
445 
446   // Make sure requested values are compatible with values implied by requested
447   // minimum/maximum flat work group sizes.
448   if (RequestedFlatWorkGroupSize &&
449       Requested.first < MinImpliedByFlatWorkGroupSize)
450     return Default;
451 
452   return Requested;
453 }
454 
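// Kernels compiled from OpenCL may carry a required work group size as
// function metadata, e.g. (illustrative):
//   define amdgpu_kernel void @k() !reqd_work_group_size !0 { ... }
//   !0 = !{i32 64, i32 1, i32 1}
// The helper below returns the size for one dimension, or UINT_MAX if the
// metadata is absent or malformed.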
455 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
456   auto Node = Kernel.getMetadata("reqd_work_group_size");
457   if (Node && Node->getNumOperands() == 3)
458     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
459   return std::numeric_limits<unsigned>::max();
460 }
461 
462 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
463                                            unsigned Dimension) const {
464   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
465   if (ReqdSize != std::numeric_limits<unsigned>::max())
466     return ReqdSize - 1;
467   return getFlatWorkGroupSizes(Kernel).second - 1;
468 }
469 
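// Example of the !range metadata this attaches (illustrative), assuming a
// required work group size of 64 in the queried dimension:
//   - for a workitem id query:  !range !{i32 0, i32 64}   ; ids lie in [0, 64)
//   - for a local size query:   !range !{i32 64, i32 65}  ; the size is exactly 64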
470 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
471   Function *Kernel = I->getParent()->getParent();
472   unsigned MinSize = 0;
473   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
474   bool IdQuery = false;
475 
476   // If reqd_work_group_size is present it narrows value down.
477   if (auto *CI = dyn_cast<CallInst>(I)) {
478     const Function *F = CI->getCalledFunction();
479     if (F) {
480       unsigned Dim = UINT_MAX;
481       switch (F->getIntrinsicID()) {
482       case Intrinsic::amdgcn_workitem_id_x:
483       case Intrinsic::r600_read_tidig_x:
484         IdQuery = true;
485         LLVM_FALLTHROUGH;
486       case Intrinsic::r600_read_local_size_x:
487         Dim = 0;
488         break;
489       case Intrinsic::amdgcn_workitem_id_y:
490       case Intrinsic::r600_read_tidig_y:
491         IdQuery = true;
492         LLVM_FALLTHROUGH;
493       case Intrinsic::r600_read_local_size_y:
494         Dim = 1;
495         break;
496       case Intrinsic::amdgcn_workitem_id_z:
497       case Intrinsic::r600_read_tidig_z:
498         IdQuery = true;
499         LLVM_FALLTHROUGH;
500       case Intrinsic::r600_read_local_size_z:
501         Dim = 2;
502         break;
503       default:
504         break;
505       }
506 
507       if (Dim <= 3) {
508         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
509         if (ReqdSize != std::numeric_limits<unsigned>::max())
510           MinSize = MaxSize = ReqdSize;
511       }
512     }
513   }
514 
515   if (!MaxSize)
516     return false;
517 
518   // Range metadata is [Lo, Hi). For ID query we need to pass max size
519   // as Hi. For size query we need to pass Hi + 1.
520   if (IdQuery)
521     MinSize = 0;
522   else
523     ++MaxSize;
524 
525   MDBuilder MDB(I->getContext());
526   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
527                                                   APInt(32, MaxSize));
528   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
529   return true;
530 }
531 
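// Worked example (illustrative): for a kernel taking (i32, double), the i32
// occupies bytes [0, 4), the double is aligned up to offset 8 and occupies
// bytes [8, 16), so the explicit kernarg size is 16 and MaxAlign is 8.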
532 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
533                                                  Align &MaxAlign) const {
534   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
535          F.getCallingConv() == CallingConv::SPIR_KERNEL);
536 
537   const DataLayout &DL = F.getParent()->getDataLayout();
538   uint64_t ExplicitArgBytes = 0;
539   MaxAlign = Align(1);
540 
541   for (const Argument &Arg : F.args()) {
542     const bool IsByRef = Arg.hasByRefAttr();
543     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
544     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
545     if (!Alignment)
546       Alignment = DL.getABITypeAlign(ArgTy);
547 
548     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
549     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
550     MaxAlign = max(MaxAlign, Alignment);
551   }
552 
553   return ExplicitArgBytes;
554 }
555 
556 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
557                                                 Align &MaxAlign) const {
558   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
559 
560   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
561 
562   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
563   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
564   if (ImplicitBytes != 0) {
565     const Align Alignment = getAlignmentForImplicitArgPtr();
566     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
567   }
568 
569   // Being able to dereference past the end is useful for emitting scalar loads.
570   return alignTo(TotalSize, 4);
571 }
572 
573 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
574                              const TargetMachine &TM) :
575   R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
576   AMDGPUSubtarget(TT),
577   InstrInfo(*this),
578   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
579   FMA(false),
580   CaymanISA(false),
581   CFALUBug(false),
582   HasVertexCache(false),
583   R600ALUInst(false),
584   FP64(false),
585   TexVTXClauseSize(0),
586   Gen(R600),
587   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
588   InstrItins(getInstrItineraryForCPU(GPU)) { }
589 
590 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
591                                       unsigned NumRegionInstrs) const {
592   // Track register pressure so the scheduler can try to decrease
593   // pressure once register usage is above the threshold defined by
594   // SIRegisterInfo::getRegPressureSetLimit()
595   Policy.ShouldTrackPressure = true;
596 
597   // Enabling both top down and bottom up scheduling seems to give us less
598   // register spills than just using one of these approaches on its own.
599   Policy.OnlyTopDown = false;
600   Policy.OnlyBottomUp = false;
601 
602   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
603   if (!enableSIScheduler())
604     Policy.ShouldTrackLaneMasks = true;
605 }
606 
607 bool GCNSubtarget::hasMadF16() const {
608   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
609 }
610 
611 bool GCNSubtarget::useVGPRIndexMode() const {
612   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
613 }
614 
615 bool GCNSubtarget::useAA() const { return UseAA; }
616 
617 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
618   if (getGeneration() >= AMDGPUSubtarget::GFX10)
619     return getMaxWavesPerEU();
620 
621   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
622     if (SGPRs <= 80)
623       return 10;
624     if (SGPRs <= 88)
625       return 9;
626     if (SGPRs <= 100)
627       return 8;
628     return 7;
629   }
630   if (SGPRs <= 48)
631     return 10;
632   if (SGPRs <= 56)
633     return 9;
634   if (SGPRs <= 64)
635     return 8;
636   if (SGPRs <= 72)
637     return 7;
638   if (SGPRs <= 80)
639     return 6;
640   return 5;
641 }
642 
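// Worked example (hypothetical numbers): with a VGPR allocation granule of 4
// and 256 total VGPRs, a function using 26 VGPRs is rounded up to 28, giving
// min(max(256 / 28, 1), MaxWaves) = 9 waves (assuming MaxWaves >= 9).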
643 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
644   unsigned MaxWaves = getMaxWavesPerEU();
645   unsigned Granule = getVGPRAllocGranule();
646   if (VGPRs < Granule)
647     return MaxWaves;
648   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
649   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
650 }
651 
652 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
653   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
654   if (getGeneration() >= AMDGPUSubtarget::GFX10)
655     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
656 
657   if (MFI.hasFlatScratchInit()) {
658     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
659       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
660     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
661       return 4; // FLAT_SCRATCH, VCC (in that order).
662   }
663 
664   if (isXNACKEnabled())
665     return 4; // XNACK, VCC (in that order).
666   return 2; // VCC.
667 }
668 
669 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
670                                         unsigned NumSGPRs,
671                                         unsigned NumVGPRs) const {
672   unsigned Occupancy =
673     std::min(getMaxWavesPerEU(),
674              getOccupancyWithLocalMemSize(LDSSize, F));
675   if (NumSGPRs)
676     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
677   if (NumVGPRs)
678     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
679   return Occupancy;
680 }
681 
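// The SGPR budget can be narrowed explicitly with the "amdgpu-num-sgpr"
// function attribute, e.g. (illustrative): "amdgpu-num-sgpr"="48". Requests
// that do not exceed the reserved SGPRs, or that conflict with the
// waves-per-EU bounds, are dropped below; requests smaller than the number of
// preloaded input SGPRs are raised to cover them.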
682 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
683   const Function &F = MF.getFunction();
684   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
685 
686   // Compute maximum number of SGPRs function can use using default/requested
687   // minimum number of waves per execution unit.
688   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
689   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
690   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
691 
692   // Check if maximum number of SGPRs was explicitly requested using
693   // "amdgpu-num-sgpr" attribute.
694   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
695     unsigned Requested = AMDGPU::getIntegerAttribute(
696       F, "amdgpu-num-sgpr", MaxNumSGPRs);
697 
698     // Make sure requested value does not violate subtarget's specifications.
699     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
700       Requested = 0;
701 
702     // If more SGPRs are required to support the input user/system SGPRs,
703     // increase to accommodate them.
704     //
705     // FIXME: This really ends up using the requested number of SGPRs + number
706     // of reserved special registers in total. Theoretically you could re-use
707     // the last input registers for these special registers, but this would
708     // require a lot of complexity to deal with the weird aliasing.
709     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
710     if (Requested && Requested < InputNumSGPRs)
711       Requested = InputNumSGPRs;
712 
713     // Make sure requested value is compatible with values implied by
714     // default/requested minimum/maximum number of waves per execution unit.
715     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
716       Requested = 0;
717     if (WavesPerEU.second &&
718         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
719       Requested = 0;
720 
721     if (Requested)
722       MaxNumSGPRs = Requested;
723   }
724 
725   if (hasSGPRInitBug())
726     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
727 
728   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
729                   MaxAddressableNumSGPRs);
730 }
731 
732 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
733   const Function &F = MF.getFunction();
734   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
735 
736   // Compute maximum number of VGPRs function can use using default/requested
737   // minimum number of waves per execution unit.
738   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
739   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
740 
741   // Check if maximum number of VGPRs was explicitly requested using
742   // "amdgpu-num-vgpr" attribute.
743   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
744     unsigned Requested = AMDGPU::getIntegerAttribute(
745       F, "amdgpu-num-vgpr", MaxNumVGPRs);
746 
747     // Make sure requested value is compatible with values implied by
748     // default/requested minimum/maximum number of waves per execution unit.
749     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
750       Requested = 0;
751     if (WavesPerEU.second &&
752         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
753       Requested = 0;
754 
755     if (Requested)
756       MaxNumVGPRs = Requested;
757   }
758 
759   return MaxNumVGPRs;
760 }
761 
762 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
763                                          int UseOpIdx, SDep &Dep) const {
764   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
765       !Def->isInstr() || !Use->isInstr())
766     return;
767 
768   MachineInstr *DefI = Def->getInstr();
769   MachineInstr *UseI = Use->getInstr();
770 
771   if (DefI->isBundle()) {
772     const SIRegisterInfo *TRI = getRegisterInfo();
773     auto Reg = Dep.getReg();
774     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
775     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
776     unsigned Lat = 0;
777     for (++I; I != E && I->isBundledWithPred(); ++I) {
778       if (I->modifiesRegister(Reg, TRI))
779         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
780       else if (Lat)
781         --Lat;
782     }
783     Dep.setLatency(Lat);
784   } else if (UseI->isBundle()) {
785     const SIRegisterInfo *TRI = getRegisterInfo();
786     auto Reg = Dep.getReg();
787     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
788     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
789     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
790     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
791       if (I->readsRegister(Reg, TRI))
792         break;
793       --Lat;
794     }
795     Dep.setLatency(Lat);
796   }
797 }
798 
799 namespace {
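// Post-RA scheduling DAG mutation that tries to pull independent SALU
// instructions into the shadow of long-latency MFMA instructions (see apply()
// below), so the MFMA latency is covered without issuing VALU work that would
// add to the power burst.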
800 struct FillMFMAShadowMutation : ScheduleDAGMutation {
801   const SIInstrInfo *TII;
802 
803   ScheduleDAGMI *DAG;
804 
805   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
806 
807   bool isSALU(const SUnit *SU) const {
808     const MachineInstr *MI = SU->getInstr();
809     return MI && TII->isSALU(*MI) && !MI->isTerminator();
810   }
811 
812   bool isVALU(const SUnit *SU) const {
813     const MachineInstr *MI = SU->getInstr();
814     return MI && TII->isVALU(*MI);
815   }
816 
817   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
818     if (Pred->NodeNum < Succ->NodeNum)
819       return true;
820 
821     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
822 
823     for (unsigned I = 0; I < Succs.size(); ++I) {
824       for (const SDep &SI : Succs[I]->Succs) {
825         const SUnit *SU = SI.getSUnit();
826         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
827           Succs.push_back(SU);
828       }
829     }
830 
831     SmallPtrSet<const SUnit*, 32> Visited;
832     while (!Preds.empty()) {
833       const SUnit *SU = Preds.pop_back_val();
834       if (llvm::is_contained(Succs, SU))
835         return false;
836       Visited.insert(SU);
837       for (const SDep &SI : SU->Preds)
838         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
839           Preds.push_back(SI.getSUnit());
840     }
841 
842     return true;
843   }
844 
845   // Link as many SALU instructions in a chain as possible. Return the size
846   // of the chain. Links up to MaxChain instructions.
847   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
848                          SmallPtrSetImpl<SUnit *> &Visited) const {
849     SmallVector<SUnit *, 8> Worklist({To});
850     unsigned Linked = 0;
851 
852     while (!Worklist.empty() && MaxChain-- > 0) {
853       SUnit *SU = Worklist.pop_back_val();
854       if (!Visited.insert(SU).second)
855         continue;
856 
857       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
858                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
859 
860       if (SU->addPred(SDep(From, SDep::Artificial), false))
861         ++Linked;
862 
863       for (SDep &SI : From->Succs) {
864         SUnit *SUv = SI.getSUnit();
865         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
866           SUv->addPred(SDep(SU, SDep::Artificial), false);
867       }
868 
869       for (SDep &SI : SU->Succs) {
870         SUnit *Succ = SI.getSUnit();
871         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
872           Worklist.push_back(Succ);
873       }
874     }
875 
876     return Linked;
877   }
878 
879   void apply(ScheduleDAGInstrs *DAGInstrs) override {
880     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
881     if (!ST.hasMAIInsts() || DisablePowerSched)
882       return;
883     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
884     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
885     if (!TSchedModel || DAG->SUnits.empty())
886       return;
887 
888     // Scan for MFMA long latency instructions and try to add a dependency
889     // of available SALU instructions to give them a chance to fill MFMA
890     // shadow. That is desirable to fill MFMA shadow with SALU instructions
891     // rather than VALU to prevent power consumption bursts and throttle.
892     auto LastSALU = DAG->SUnits.begin();
893     auto E = DAG->SUnits.end();
894     SmallPtrSet<SUnit*, 32> Visited;
895     for (SUnit &SU : DAG->SUnits) {
896       MachineInstr &MAI = *SU.getInstr();
897       if (!TII->isMAI(MAI) ||
898            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
899            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
900         continue;
901 
902       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
903 
904       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
905                  dbgs() << "Need " << Lat
906                         << " instructions to cover latency.\n");
907 
908       // Find up to Lat independent scalar instructions as early as
909       // possible such that they can be scheduled after this MFMA.
910       for ( ; Lat && LastSALU != E; ++LastSALU) {
911         if (Visited.count(&*LastSALU))
912           continue;
913 
914         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
915           continue;
916 
917         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
918       }
919     }
920   }
921 };
922 } // namespace
923 
924 void GCNSubtarget::getPostRAMutations(
925     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
926   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
927 }
928 
929 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
930   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
931     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
932   else
933     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
934 }
935 
936 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
937   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
938     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
939   else
940     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
941 }
942