//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

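// Maximum number of scalar operands (SGPRs or literal constants) a single VALU
// instruction may read over the constant bus. Subtargets before GFX10 allow
// only one; GFX10 allows two, except for the 64-bit shift opcodes listed
// below, which remain limited to a single scalar source.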
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

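// Estimate the occupancy (waves per EU) achievable when each work group of F
// uses Bytes of LDS. The result is clamped to the [1, MaxWavesPerEU] range; a
// work group size that cannot be mapped onto a CU yields 0.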
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

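// Default flat work group size bounds: the graphics shader calling conventions
// default to at most one wavefront per work group, while compute-like calling
// conventions default to the subtarget's full flat work group size range.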
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

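// Attach !range metadata to a local-id or local-size query so later passes can
// assume the value is bounded by the kernel's flat work group size (narrowed
// further by reqd_work_group_size metadata when it is present).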
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi; for a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

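// Number of bytes occupied by the explicit kernel arguments of F, laying each
// argument out at its ABI type alignment; MaxAlign is updated to the largest
// argument alignment seen.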
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align::None();

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

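// Occupancy (waves per EU) as limited by SGPR usage. On GFX10 and later SGPRs
// do not limit occupancy, so the subtarget maximum is returned.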
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

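// Occupancy (waves per EU) as limited by VGPR usage: round the requested count
// up to the VGPR allocation granule and see how many allocations of that size
// fit into the register file, clamped to [1, MaxWavesPerEU].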
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

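// Final occupancy estimate for MF: the minimum of the subtarget wave limit and
// the LDS-, SGPR- and VGPR-limited occupancies (the register limits are only
// applied when a non-zero count is passed in).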
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

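// Adjust the latency of data dependencies that involve bundles. If the source
// node is a bundle, start from the latency of the bundled instruction that
// defines the register and deduct one per instruction issued after it within
// the bundle; if the destination is a bundle, deduct one per bundled
// instruction issued before the one that reads the register.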
void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
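// DAG mutation that adds artificial edges between back-to-back memory
// operations of the same kind (VMEM, FLAT, SMRD or DS) so the scheduler does
// not move them apart.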
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

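// DAG mutation that tries to place independent SALU instructions into the
// latency shadow of long-latency MFMA instructions; filling the shadow with
// SALU rather than VALU work helps avoid MAI power bursts (see apply() below).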
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. That is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}