//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

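// Scratch accesses use flat scratch instructions only when both the
// amdgpu-enable-flat-scratch flag is set and the subtarget supports them.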
bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

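// Number of SGPR/constant-bus operands a single VALU instruction may read.
// GFX10 generally allows two, but the 64-bit shift opcodes listed below are
// still limited to one; older generations always allow only one.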
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

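// Maximum LDS size a workgroup may use while still allowing NWaves waves per
// execution unit to remain resident.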
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

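// Default flat workgroup size range for a calling convention: graphics shader
// stages default to a single wavefront, compute uses the full range supported
// by the subtarget.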
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

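// Work-group size in dimension Dim from the kernel's !reqd_work_group_size
// metadata, or UINT_MAX if the metadata is absent.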
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

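// Largest workitem ID possible in the given dimension: one less than the
// required work-group size if known, otherwise one less than the maximum flat
// work-group size.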
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

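// Attach !range metadata to a workitem ID or local size query, narrowing the
// bound with !reqd_work_group_size when it is available. Returns false if no
// useful upper bound is known.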
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

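// Number of bytes occupied by the explicit kernel arguments, updating MaxAlign
// to the largest argument alignment encountered.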
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

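// Total kernarg segment size: explicit arguments plus any implicit arguments
// the target appends, padded so scalar loads may dereference past the end.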
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

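// Occupancy (waves per EU) achievable with the given number of SGPRs in use.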
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

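// Occupancy (waves per EU) achievable with the given number of VGPRs in use,
// after rounding the count up to the allocation granule.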
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

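// SGPRs reserved by the target (VCC, plus FLAT_SCRATCH and XNACK where they
// still occupy SGPRs) and therefore unavailable for allocation.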
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

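// Combined occupancy estimate from LDS usage and, when non-zero, the given
// SGPR and VGPR counts.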
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

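// Refine the latency of a data dependency when either end of the edge is a
// bundle, accounting for where inside the bundle the register is actually
// written or read.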
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

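  // Return true if adding an artificial edge Pred -> Succ will not create a
  // cycle, i.e. Succ does not already reach Pred through existing edges.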
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

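// Retrieve the common AMDGPU subtarget for a machine function or an IR
// function, dispatching to the GCN or R600 subclass based on the target
// architecture.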
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}