//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
            cl::desc("Run pre-RA exec mask optimizations"),
            cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all functions early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis.
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run the late CFG structurizer.
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(true),
  cl::Hidden);

// Enable libcall simplifications.
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

// Enable atomic optimization.
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization.
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
    cl::init(true), cl::Hidden,
    cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);
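
// Note: each cl::opt above is an internal backend flag; as a usage sketch
// (hypothetical input file, flags spelled as registered above), they can be
// toggled when invoking the code generator directly, e.g.:
//   llc -march=amdgcn -amdgpu-sroa=0 -amdgpu-sdwa-peephole=0 input.ll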

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFixupVectorISelPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUFixFunctionBitcastsPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeGCNRegBankReassignPass(*PR);
  initializeGCNNSAReassignPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);
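
// The MachineSchedRegistry entries above expose these schedulers by name
// through the common machine-scheduler selection mechanism; as a sketch
// (assuming the standard -misched flag), something like
//   llc -march=amdgcn -misched=gcn-ilp input.ll
// selects the iterative ILP scheduler instead of the default returned by
// GCNPassConfig::createMachineScheduler() below.
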

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
         "-ni:7";
}
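
// A rough reading of the amdgcn layout string above (informal, not normative):
// the "p<N>" entries give pointer size/alignment per address space (p1/global
// and p4/constant are 64-bit; p3/local and p5/private are 32-bit), "n32:64"
// lists the native integer widths, "S32" is the natural stack alignment in
// bits, "A5" places allocas in address space 5, and "-ni:7" marks address
// space 7 (buffer fat pointers) as non-integral.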

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}
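
// For illustration (hypothetical attribute values): a function carrying
// "target-cpu"="gfx906" and "target-features"="+xnack" in its IR attributes
// gets those strings back from getGPUName()/getFeatureString(), and they form
// the per-function subtarget key in getSubtargetImpl() below; functions
// without the attributes fall back to the TargetMachine-wide CPU and feature
// string.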

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                               legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      PM.add(createAMDGPUPrintfRuntimeBinding());
      PM.add(createAMDGPUPropagateAttributesLatePass(this));
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
                                            legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}
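
// Note: adjustPassManager() above only affects pipelines built through the
// legacy PassManagerBuilder (e.g. the opt/clang middle-end pipelines); the
// codegen pipeline itself is assembled by the pass-config classes below.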

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);

  // Override the default since calls aren't supported for r600.
  if (EnableFunctionCalls &&
      EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
    EnableFunctionCalls = false;
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;

  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};

std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph.  We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;
  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());

  // This must occur before inlining, as the inliner will not look through
  // bitcast calls.
  addPass(createAMDGPUFixFunctionBitcastsPass());

  // Run the propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAtomicExpandPass());

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    if (EnableScalarIRPasses)
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
    addPass(createAMDGPUAnnotateKernelFeaturesPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  addPass(&AMDGPUPerfHintAnalysisID);

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createLowerSwitchPass());
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  // Defer the verifier until FinalizeISel.
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  addPass(createSIFixupVectorISelPass());
  addPass(createSIAddIMGInitPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  if (OptExecMaskPreRA) {
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
    insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
  } else {
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
  }

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign) {
    addPass(&GCNNSAReassignID);
    addPass(&GCNRegBankReassignID);
  }
  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSIModeRegisterPass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  //
  // FIXME: This stand-alone pass will emit individual S_NOP 0, as needed. It
  // would be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIInsertSkipsPassID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(*MFI,
                                         *MF.getSubtarget().getRegisterInfo());
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MFI->initializeBaseYamlFields(YamlMFI);

  auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
    if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }

    return false;
  };

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      unsigned Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32Denormals = YamlMFI.Mode.FP32Denormals;
  MFI->Mode.FP64FP16Denormals = YamlMFI.Mode.FP64FP16Denormals;

  return false;
}