//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer --------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 ///
12 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
13 /// code. When passed an MCAsmStreamer it prints assembly and when passed
14 /// an MCObjectStreamer it outputs binary code.
15 //
16 //===----------------------------------------------------------------------===//
17 //
18
19 #include "AMDGPUAsmPrinter.h"
20 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
21 #include "InstPrinter/AMDGPUInstPrinter.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "AMDGPU.h"
24 #include "AMDKernelCodeT.h"
25 #include "AMDGPUSubtarget.h"
26 #include "R600Defines.h"
27 #include "R600MachineFunctionInfo.h"
28 #include "R600RegisterInfo.h"
29 #include "SIDefines.h"
30 #include "SIMachineFunctionInfo.h"
31 #include "SIInstrInfo.h"
32 #include "SIRegisterInfo.h"
33 #include "llvm/CodeGen/MachineFrameInfo.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/MC/MCContext.h"
36 #include "llvm/MC/MCSectionELF.h"
37 #include "llvm/MC/MCStreamer.h"
38 #include "llvm/Support/ELF.h"
39 #include "llvm/Support/MathExtras.h"
40 #include "llvm/Support/TargetRegistry.h"
41 #include "llvm/Target/TargetLoweringObjectFile.h"
42
43 using namespace llvm;
44
45 // TODO: This should get the default rounding mode from the kernel. We just set
46 // the default here, but this could change if the OpenCL rounding mode pragmas
47 // are used.
48 //
49 // The denormal mode here should match what is reported by the OpenCL runtime
50 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
52 //
53 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
54 // precision, and leaves single precision to flush all and does not report
55 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
56 // CL_FP_DENORM for both.
57 //
58 // FIXME: It seems some instructions do not support single precision denormals
59 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
60 // and sin_f32, cos_f32 on most parts).
61
62 // We want to use these instructions, and using fp32 denormals also causes
63 // instructions to run at the double precision rate for the device so it's
64 // probably best to just report no single precision denormals.
getFPMode(const MachineFunction & F)65 static uint32_t getFPMode(const MachineFunction &F) {
66 const SISubtarget& ST = F.getSubtarget<SISubtarget>();
67 // TODO: Is there any real use for the flush in only / flush out only modes?
68
69 uint32_t FP32Denormals =
70 ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
71
72 uint32_t FP64Denormals =
73 ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
74
75 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
76 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
77 FP_DENORM_MODE_SP(FP32Denormals) |
78 FP_DENORM_MODE_DP(FP64Denormals);
79 }
80
81 static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine & tm,std::unique_ptr<MCStreamer> && Streamer)82 createAMDGPUAsmPrinterPass(TargetMachine &tm,
83 std::unique_ptr<MCStreamer> &&Streamer) {
84 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
85 }
86
/// Entry point invoked by LLVM target initialization; registers the asm
/// printer factory for both the R600 and GCN target descriptors.
extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
}
91
// The printer itself holds no extra state; all work is delegated to the
// AsmPrinter base class and the per-function emission hooks below.
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
  : AsmPrinter(TM, std::move(Streamer)) {}
95
EmitStartOfAsmFile(Module & M)96 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
97 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
98 return;
99
100 // Need to construct an MCSubtargetInfo here in case we have no functions
101 // in the module.
102 std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
103 TM.getTargetTriple().str(), TM.getTargetCPU(),
104 TM.getTargetFeatureString()));
105
106 AMDGPUTargetStreamer *TS =
107 static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
108
109 TS->EmitDirectiveHSACodeObjectVersion(2, 1);
110
111 AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
112 TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
113 "AMD", "AMDGPU");
114 }
115
EmitFunctionBodyStart()116 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
117 const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
118 SIProgramInfo KernelInfo;
119 if (STM.isAmdHsaOS()) {
120 getSIProgramInfo(KernelInfo, *MF);
121 EmitAmdKernelCodeT(*MF, KernelInfo);
122 }
123 }
124
EmitFunctionEntryLabel()125 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
126 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
127 const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
128 if (MFI->isKernel() && STM.isAmdHsaOS()) {
129 AMDGPUTargetStreamer *TS =
130 static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
131 TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
132 ELF::STT_AMDGPU_HSA_KERNEL);
133 }
134
135 AsmPrinter::EmitFunctionEntryLabel();
136 }
137
EmitGlobalVariable(const GlobalVariable * GV)138 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
139
140 // Group segment variables aren't emitted in HSA.
141 if (AMDGPU::isGroupSegment(GV))
142 return;
143
144 AsmPrinter::EmitGlobalVariable(GV);
145 }
146
/// Drive per-function assembly emission: compute program info, emit the
/// .AMDGPU.config section, the function body, verbose kernel statistics in
/// .AMDGPU.csdata, and (with -dump-code) a disassembly in .AMDGPU.disasm.
/// Always returns false (the MachineFunction is not modified).
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {

  // The starting address of all shader programs must be 256 bytes aligned.
  // Alignment is expressed as log2, so 8 => 1 << 8 == 256 bytes.
  MF.setAlignment(8);

  SetupMachineFunction(MF);

  MCContext &Context = getObjFileLowering().getContext();
  MCSectionELF *ConfigSection =
      Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
  OutStreamer->SwitchSection(ConfigSection);

  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  SIProgramInfo KernelInfo;
  if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    getSIProgramInfo(KernelInfo, MF);
    // On AMDHSA the program info is emitted via the amd_kernel_code_t header
    // in EmitFunctionBodyStart() instead of the config section.
    if (!STM.isAmdHsaOS()) {
      EmitProgramInfoSI(MF, KernelInfo);
    }
  } else {
    EmitProgramInfoR600(MF);
  }

  // Reset the per-function disassembly buffers that the instruction emitter
  // fills in while EmitFunctionBody() runs (used below for -dump-code).
  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      OutStreamer->emitRawComment(" Kernel info:", false);
      OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
                                  false);
      OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
                                  false);
      OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
                                  false);
      OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
                                  false);
      OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
                                  false);
      OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
                                  false);
      OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
                                  " bytes/workgroup (compile time only)", false);

      OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
                                  false);
      OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
                                  false);

      // Debugger prologue SGPR assignments, only valid when the debugger
      // prologue was actually emitted for this function.
      if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
        OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
                                    Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
        OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
                                    Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
      }

      // Decode the packed COMPUTE_PGM_RSRC2 fields for human consumption.
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                  Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                  Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                  Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                  Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                  Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
                                  false);

    } else {
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      OutStreamer->emitRawComment(
        Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
    }
  }

  if (STM.dumpCode()) {

    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    // Emit each disassembled line padded to a common width, followed by its
    // hex encoding as a trailing comment.
    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
      Comment += " ; " + HexLines[i] + "\n";

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}
249
/// Emit R600/Evergreen program-resource register writes into the
/// .AMDGPU.config section: GPR/stack usage, kill-pixel enable, and (for
/// compute) the LDS allocation. Each entry is a (register, value) pair of
/// 4-byte integers consumed by the driver.
void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
  unsigned MaxGPR = 0;
  bool killPixel = false;
  const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
  const R600RegisterInfo *RI = STM.getRegisterInfo();
  const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  // Scan every operand of every instruction to find the highest GPR index
  // used, and note whether any KILLGT (kill pixel) instruction appears.
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      if (MI.getOpcode() == AMDGPU::KILLGT)
        killPixel = true;
      unsigned numOperands = MI.getNumOperands();
      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
        const MachineOperand &MO = MI.getOperand(op_idx);
        if (!MO.isReg())
          continue;
        unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;

        // Register with value > 127 aren't GPR
        if (HWReg > 127)
          continue;
        MaxGPR = std::max(MaxGPR, HWReg);
      }
    }
  }

  // The SQ_PGM_RESOURCES register address differs per shader stage and per
  // hardware generation.
  unsigned RsrcReg;
  if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
    // Evergreen / Northern Islands
    switch (MF.getFunction()->getCallingConv()) {
    default: // Fall through
    case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
    case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
    case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
    case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
    }
  } else {
    // R600 / R700
    switch (MF.getFunction()->getCallingConv()) {
    default: // Fall through
    case CallingConv::AMDGPU_GS: // Fall through
    case CallingConv::AMDGPU_CS: // Fall through
    case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
    case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
    }
  }

  // GPR indices are 0-based, so the count is MaxGPR + 1.
  OutStreamer->EmitIntValue(RsrcReg, 4);
  OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
                            S_STACK_SIZE(MFI->StackSize), 4);
  OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
  OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);

  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    // LDS allocation is programmed in 4-byte units.
    OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
    OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4);
  }
}
308
/// Populate \p ProgInfo with the SI/GCN program resource description for
/// \p MF: SGPR/VGPR usage (found by scanning every register operand), code
/// size, scratch and LDS sizes, and the packed COMPUTE_PGM_RSRC1/RSRC2
/// register values. Diagnoses resource-limit violations through the
/// LLVMContext.
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) const {
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  uint64_t CodeSize = 0;
  unsigned MaxSGPR = 0;
  unsigned MaxVGPR = 0;
  bool VCCUsed = false;
  bool FlatUsed = false;
  const SIRegisterInfo *RI = STM.getRegisterInfo();
  const SIInstrInfo *TII = STM.getInstrInfo();

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
      if (MI.isDebugValue())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);

      unsigned numOperands = MI.getNumOperands();
      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
        const MachineOperand &MO = MI.getOperand(op_idx);
        unsigned width = 0;
        bool isSGPR = false;

        if (!MO.isReg())
          continue;

        // Special registers are tracked as booleans (VCC, FLAT_SCR) or
        // ignored entirely (EXEC, SCC, M0) rather than counted as GPRs.
        unsigned reg = MO.getReg();
        switch (reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
          VCCUsed = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          FlatUsed = true;
          continue;

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("Trap Handler registers should not be used");
          continue;

        default:
          break;
        }

        // Classify the register by register class to find how many
        // consecutive hardware registers (width, in dwords) it covers and
        // whether it is scalar or vector.
        if (AMDGPU::SReg_32RegClass.contains(reg)) {
          if (AMDGPU::TTMP_32RegClass.contains(reg)) {
            llvm_unreachable("Trap Handler registers should not be used");
          }
          isSGPR = true;
          width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
          isSGPR = false;
          width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
          if (AMDGPU::TTMP_64RegClass.contains(reg)) {
            llvm_unreachable("Trap Handler registers should not be used");
          }
          isSGPR = true;
          width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
          isSGPR = false;
          width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
          isSGPR = false;
          width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
          isSGPR = true;
          width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
          isSGPR = false;
          width = 4;
        } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
          isSGPR = true;
          width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
          isSGPR = false;
          width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
          isSGPR = true;
          width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
          isSGPR = false;
          width = 16;
        } else {
          llvm_unreachable("Unknown register class");
        }
        // Track the highest hardware register index touched, accounting for
        // the width of multi-register tuples.
        unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
        unsigned maxUsed = hwReg + width - 1;
        if (isSGPR) {
          MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
        } else {
          MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
        }
      }
    }
  }

  // Account for SGPRs reserved by the hardware ABI beyond those seen in the
  // instruction stream: VCC, FLAT_SCRATCH, and (on VI+) XNACK. Note these
  // values replace each other rather than accumulate — the largest
  // applicable reservation wins.
  unsigned ExtraSGPRs = 0;

  if (VCCUsed)
    ExtraSGPRs = 2;

  if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
    if (FlatUsed)
      ExtraSGPRs = 4;
  } else {
    if (STM.isXNACKEnabled())
      ExtraSGPRs = 4;

    if (FlatUsed)
      ExtraSGPRs = 6;
  }

  MaxSGPR += ExtraSGPRs;

  // Record first reserved register and reserved register count fields, and
  // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
  // specified.
  if (STM.debuggerReserveRegs()) {
    ProgInfo.ReservedVGPRFirst = MaxVGPR + 1;
    ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount();
    MaxVGPR += MFI->getDebuggerReservedVGPRCount();
  }

  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
  // attribute was specified.
  if (STM.debuggerEmitPrologue()) {
    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
      RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
      RI->getHWRegIndex(MFI->getScratchRSrcReg());
  }

  // We found the maximum register index. They start at 0, so add one to get the
  // number of registers.
  ProgInfo.NumVGPR = MaxVGPR + 1;
  ProgInfo.NumSGPR = MaxSGPR + 1;

  // Hardware with the SGPR-init bug must always program the fixed SGPR
  // count; exceeding it is a hard error.
  if (STM.hasSGPRInitBug()) {
    if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
      LLVMContext &Ctx = MF.getFunction()->getContext();
      DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
                                       "SGPRs with SGPR init bug",
                                       ProgInfo.NumSGPR, DS_Error);
      Ctx.diagnose(Diag);
    }

    ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
  }

  if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction()->getContext();
    DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
                                     MFI->NumUserSGPRs, DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->LDSSize > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction()->getContext();
    DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
                                     MFI->LDSSize, DS_Error);
    Ctx.diagnose(Diag);
  }

  // Hardware programs register counts in blocks: VGPRs in groups of 4,
  // SGPRs in groups of 8.
  ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
  ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  ProgInfo.IEEEMode = 0;

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = 1;

  const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
  ProgInfo.ScratchSize = FrameInfo->getStackSize();

  ProgInfo.FlatUsed = FlatUsed;
  ProgInfo.VCCUsed = VCCUsed;
  ProgInfo.CodeLen = CodeSize;

  unsigned LDSAlignShift;
  if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  // LDS spill space is per-workitem, so scale by the maximum workgroup size.
  unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
                          MFI->getMaximumWorkGroupSize(MF);

  ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  // Pack the COMPUTE_PGM_RSRC1 register value.
  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // Pack the COMPUTE_PGM_RSRC2 register value.
  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}
568
getRsrcReg(CallingConv::ID CallConv)569 static unsigned getRsrcReg(CallingConv::ID CallConv) {
570 switch (CallConv) {
571 default: // Fall through
572 case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
573 case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
574 case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
575 case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
576 }
577 }
578
/// Emit the SI/GCN program info as (register, value) pairs of 4-byte
/// integers into the current section (.AMDGPU.config). Only used for
/// non-HSA targets; HSA emits an amd_kernel_code_t header instead.
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &KernelInfo) {
  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);

    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    // Graphics shaders: the stage-specific RSRC1 register carries the
    // GPR-block counts; scratch size is only emitted when spilling is on.
    OutStreamer->EmitIntValue(RsrcReg, 4);
    OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
    if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
    }
  }

  // Pixel shaders additionally report extra LDS and the input ena/addr masks.
  if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
  }

  OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
  OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
}
622
623 // This is supposed to be log2(Size)
getElementByteSizeValue(unsigned Size)624 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
625 switch (Size) {
626 case 4:
627 return AMD_ELEMENT_4_BYTES;
628 case 8:
629 return AMD_ELEMENT_8_BYTES;
630 case 16:
631 return AMD_ELEMENT_16_BYTES;
632 default:
633 llvm_unreachable("invalid private_element_size");
634 }
635 }
636
EmitAmdKernelCodeT(const MachineFunction & MF,const SIProgramInfo & KernelInfo) const637 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
638 const SIProgramInfo &KernelInfo) const {
639 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
640 const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
641 amd_kernel_code_t header;
642
643 AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
644
645 header.compute_pgm_resource_registers =
646 KernelInfo.ComputePGMRSrc1 |
647 (KernelInfo.ComputePGMRSrc2 << 32);
648 header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
649
650
651 AMD_HSA_BITS_SET(header.code_properties,
652 AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
653 getElementByteSizeValue(STM.getMaxPrivateElementSize()));
654
655 if (MFI->hasPrivateSegmentBuffer()) {
656 header.code_properties |=
657 AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
658 }
659
660 if (MFI->hasDispatchPtr())
661 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
662
663 if (MFI->hasQueuePtr())
664 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
665
666 if (MFI->hasKernargSegmentPtr())
667 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
668
669 if (MFI->hasDispatchID())
670 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
671
672 if (MFI->hasFlatScratchInit())
673 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
674
675 // TODO: Private segment size
676
677 if (MFI->hasGridWorkgroupCountX()) {
678 header.code_properties |=
679 AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
680 }
681
682 if (MFI->hasGridWorkgroupCountY()) {
683 header.code_properties |=
684 AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
685 }
686
687 if (MFI->hasGridWorkgroupCountZ()) {
688 header.code_properties |=
689 AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
690 }
691
692 if (MFI->hasDispatchPtr())
693 header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
694
695 if (STM.debuggerSupported())
696 header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
697
698 if (STM.isXNACKEnabled())
699 header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
700
701 header.kernarg_segment_byte_size = MFI->ABIArgOffset;
702 header.wavefront_sgpr_count = KernelInfo.NumSGPR;
703 header.workitem_vgpr_count = KernelInfo.NumVGPR;
704 header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
705 header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
706 header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
707 header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
708
709 if (STM.debuggerEmitPrologue()) {
710 header.debug_wavefront_private_segment_offset_sgpr =
711 KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
712 header.debug_private_segment_buffer_sgpr =
713 KernelInfo.DebuggerPrivateSegmentBufferSGPR;
714 }
715
716 AMDGPUTargetStreamer *TS =
717 static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
718
719 OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
720 TS->EmitAMDKernelCodeT(header);
721 }
722
PrintAsmOperand(const MachineInstr * MI,unsigned OpNo,unsigned AsmVariant,const char * ExtraCode,raw_ostream & O)723 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
724 unsigned AsmVariant,
725 const char *ExtraCode, raw_ostream &O) {
726 if (ExtraCode && ExtraCode[0]) {
727 if (ExtraCode[1] != 0)
728 return true; // Unknown modifier.
729
730 switch (ExtraCode[0]) {
731 default:
732 // See if this is a generic print operand
733 return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
734 case 'r':
735 break;
736 }
737 }
738
739 AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
740 *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
741 return false;
742 }
743