1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI implementation of the TargetRegisterInfo class.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "SIRegisterInfo.h"
15 #include "AMDGPURegisterBankInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "SIInstrInfo.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "MCTargetDesc/AMDGPUInstPrinter.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/MachineDominators.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/CodeGen/RegisterScavenging.h"
26 #include "llvm/CodeGen/SlotIndexes.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/LLVMContext.h"
29 #include <vector>
30
31 using namespace llvm;
32
33 #define GET_REGINFO_TARGET_DESC
34 #include "AMDGPUGenRegisterInfo.inc"
35
static cl::opt<bool> EnableSpillSGPRToVGPR(
  "amdgpu-spill-sgpr-to-vgpr",
  cl::desc("Enable spilling SGPRs to VGPRs"),
  cl::ReallyHidden,
  cl::init(true));
41
42 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
43 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
44
45 // Map numbers of DWORDs to indexes in SubRegFromChannelTable.
46 // Valid indexes are shifted 1, such that a 0 mapping means unsupported.
47 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
48 // meaning index 7 in SubRegFromChannelTable.
49 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
50 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
51
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
      SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
55
56 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
57 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
58 (getSubRegIndexLaneMask(AMDGPU::lo16) |
59 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
60 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
61 "getNumCoveredRegs() will not work with generated subreg masks!");
62
63 RegPressureIgnoredUnits.resize(getNumRegUnits());
64 RegPressureIgnoredUnits.set(
65 *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
66 for (auto Reg : AMDGPU::VGPR_HI16RegClass)
67 RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
68
69 // HACK: Until this is fully tablegen'd.
70 static llvm::once_flag InitializeRegSplitPartsFlag;
71
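  // RegSplitParts[N - 1][P] holds the sub-register index that covers N
  // 32-bit channels starting at channel N * P, so a register tuple can be
  // split into equally sized, naturally aligned pieces when spilling.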
72 static auto InitializeRegSplitPartsOnce = [this]() {
73 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
74 unsigned Size = getSubRegIdxSize(Idx);
75 if (Size & 31)
76 continue;
77 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
78 unsigned Pos = getSubRegIdxOffset(Idx);
79 if (Pos % Size)
80 continue;
81 Pos /= Size;
82 if (Vec.empty()) {
83 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
84 Vec.resize(MaxNumParts);
85 }
86 Vec[Pos] = Idx;
87 }
88 };
89
90 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
91
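  // SubRegFromChannelTable[W][C] maps a (width, start channel) pair to the
  // matching sub-register index, where W is the compressed width index from
  // SubRegFromChannelTableWidthMap and C is the first 32-bit channel covered.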
92 static auto InitializeSubRegFromChannelTableOnce = [this]() {
93 for (auto &Row : SubRegFromChannelTable)
94 Row.fill(AMDGPU::NoSubRegister);
95 for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
96 unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
97 unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
98 assert(Width < SubRegFromChannelTableWidthMap.size());
99 Width = SubRegFromChannelTableWidthMap[Width];
100 if (Width == 0)
101 continue;
102 unsigned TableIdx = Width - 1;
103 assert(TableIdx < SubRegFromChannelTable.size());
104 assert(Offset < SubRegFromChannelTable[TableIdx].size());
105 SubRegFromChannelTable[TableIdx][Offset] = Idx;
106 }
107 };
108
109 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
110 llvm::call_once(InitializeSubRegFromChannelTableFlag,
111 InitializeSubRegFromChannelTableOnce);
112 }
113
void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
116 MCRegAliasIterator R(Reg, this, true);
117
118 for (; R.isValid(); ++R)
119 Reserved.set(*R);
120 }
121
122 // Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
125 CallingConv::ID CC = MF->getFunction().getCallingConv();
126 switch (CC) {
127 case CallingConv::C:
128 case CallingConv::Fast:
129 case CallingConv::Cold:
130 case CallingConv::AMDGPU_Gfx:
131 return CSR_AMDGPU_HighRegs_SaveList;
132 default: {
133 // Dummy to not crash RegisterClassInfo.
134 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
135 return &NoCalleeSavedReg;
136 }
137 }
138 }
139
const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
142 return nullptr;
143 }
144
const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
147 switch (CC) {
148 case CallingConv::C:
149 case CallingConv::Fast:
150 case CallingConv::Cold:
151 case CallingConv::AMDGPU_Gfx:
152 return CSR_AMDGPU_HighRegs_RegMask;
153 default:
154 return nullptr;
155 }
156 }
157
const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
159 return CSR_AMDGPU_NoRegs_RegMask;
160 }
161
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
163 const SIFrameLowering *TFI =
164 MF.getSubtarget<GCNSubtarget>().getFrameLowering();
165 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
166 // During ISel lowering we always reserve the stack pointer in entry
167 // functions, but never actually want to reference it when accessing our own
168 // frame. If we need a frame pointer we use it, but otherwise we can just use
169 // an immediate "0" which we represent by returning NoRegister.
170 if (FuncInfo->isEntryFunction()) {
171 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
172 }
173 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
174 : FuncInfo->getStackPtrOffsetReg();
175 }
176
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
178 // When we need stack realignment, we can't reference off of the
179 // stack pointer, so we reserve a base pointer.
180 const MachineFrameInfo &MFI = MF.getFrameInfo();
181 return MFI.getNumFixedObjects() && needsStackRealignment(MF);
182 }
183
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
185
const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
187 return CSR_AMDGPU_AllVGPRs_RegMask;
188 }
189
const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
191 return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
192 }
193
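// Look up the sub-register index covering NumRegs 32-bit channels starting at
// Channel. For example, getSubRegFromChannel(2, 2) is expected to return the
// sub2_sub3 index on this target.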
unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
196 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
197 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
198 assert(NumRegIndex && "Not implemented");
199 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
200 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
201 }
202
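// Pick the highest naturally aligned 4-SGPR tuple below this function's SGPR
// limit to hold the scratch buffer resource descriptor.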
MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
205 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
206 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
207 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
208 }
209
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
211 BitVector Reserved(getNumRegs());
212 Reserved.set(AMDGPU::MODE);
213
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
216 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
217 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
218
219 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
220 reserveRegisterTuples(Reserved, AMDGPU::M0);
221
222 // Reserve src_vccz, src_execz, src_scc.
223 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
224 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
225 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
226
227 // Reserve the memory aperture registers.
228 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
229 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
230 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
231 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
232
233 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
234 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
235
236 // Reserve xnack_mask registers - support is not implemented in Codegen.
237 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
238
239 // Reserve lds_direct register - support is not implemented in Codegen.
240 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
241
242 // Reserve Trap Handler registers - support is not implemented in Codegen.
243 reserveRegisterTuples(Reserved, AMDGPU::TBA);
244 reserveRegisterTuples(Reserved, AMDGPU::TMA);
245 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
246 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
247 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
248 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
249 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
250 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
251 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
252 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
253
254 // Reserve null register - it shall never be allocated
255 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
256
257 // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
258 // will result in bugs.
259 if (isWave32) {
260 Reserved.set(AMDGPU::VCC);
261 Reserved.set(AMDGPU::VCC_HI);
262 }
263
264 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
265 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
266 for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
267 unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
268 reserveRegisterTuples(Reserved, Reg);
269 }
270
271 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
272 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
273 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
274 unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
275 reserveRegisterTuples(Reserved, Reg);
276 Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
277 reserveRegisterTuples(Reserved, Reg);
278 }
279
280 for (auto Reg : AMDGPU::SReg_32RegClass) {
281 Reserved.set(getSubReg(Reg, AMDGPU::hi16));
282 Register Low = getSubReg(Reg, AMDGPU::lo16);
283 // This is to prevent BB vcc liveness errors.
284 if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
285 Reserved.set(Low);
286 }
287
288 for (auto Reg : AMDGPU::AGPR_32RegClass) {
289 Reserved.set(getSubReg(Reg, AMDGPU::hi16));
290 }
291
  // Reserve all remaining AGPRs if there are no instructions to use them.
293 if (!ST.hasMAIInsts()) {
294 for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
295 unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
296 reserveRegisterTuples(Reserved, Reg);
297 }
298 }
299
300 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
301
302 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
303 if (ScratchRSrcReg != AMDGPU::NoRegister) {
304 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
305 // to spill.
306 // TODO: May need to reserve a VGPR if doing LDS spilling.
307 reserveRegisterTuples(Reserved, ScratchRSrcReg);
308 }
309
310 // We have to assume the SP is needed in case there are calls in the function,
311 // which is detected after the function is lowered. If we aren't really going
312 // to need SP, don't bother reserving it.
313 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
314
315 if (StackPtrReg) {
316 reserveRegisterTuples(Reserved, StackPtrReg);
317 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
318 }
319
320 MCRegister FrameReg = MFI->getFrameOffsetReg();
321 if (FrameReg) {
322 reserveRegisterTuples(Reserved, FrameReg);
323 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
324 }
325
326 if (hasBasePointer(MF)) {
327 MCRegister BasePtrReg = getBaseRegister();
328 reserveRegisterTuples(Reserved, BasePtrReg);
329 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
330 }
331
332 for (MCRegister Reg : MFI->WWMReservedRegs) {
333 reserveRegisterTuples(Reserved, Reg);
334 }
335
336 // FIXME: Stop using reserved registers for this.
337 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
338 reserveRegisterTuples(Reserved, Reg);
339
340 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
341 reserveRegisterTuples(Reserved, Reg);
342
343 for (auto SSpill : MFI->getSGPRSpillVGPRs())
344 reserveRegisterTuples(Reserved, SSpill.VGPR);
345
346 return Reserved;
347 }
348
bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
350 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
351 // On entry, the base address is 0, so it can't possibly need any more
352 // alignment.
353
354 // FIXME: Should be able to specify the entry frame alignment per calling
355 // convention instead.
356 if (Info->isEntryFunction())
357 return false;
358
359 return TargetRegisterInfo::canRealignStack(MF);
360 }
361
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
363 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
364 if (Info->isEntryFunction()) {
365 const MachineFrameInfo &MFI = Fn.getFrameInfo();
366 return MFI.hasStackObjects() || MFI.hasCalls();
367 }
368
369 // May need scavenger for dealing with callee saved registers.
370 return true;
371 }
372
bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
375 // Do not use frame virtual registers. They used to be used for SGPRs, but
376 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
377 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
378 // spill.
379 return false;
380 }
381
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
384 const MachineFrameInfo &MFI = MF.getFrameInfo();
385 return MFI.hasStackObjects();
386 }
387
bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
390 // There are no special dedicated stack or frame pointers.
391 return true;
392 }
393
int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
395 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
396
397 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
398 AMDGPU::OpName::offset);
399 return MI->getOperand(OffIdx).getImm();
400 }
401
int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
404 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
405 return 0;
406
407 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
408 AMDGPU::OpName::vaddr) ||
409 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
410 AMDGPU::OpName::saddr))) &&
411 "Should never see frame index on non-address operand");
412
413 return getScratchInstrOffset(MI);
414 }
415
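// Returns true if the total scratch offset (the instruction's immediate plus
// Offset) can no longer be encoded directly, so a frame base register has to
// be materialized for this access.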
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
417 if (!MI->mayLoadOrStore())
418 return false;
419
420 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
421
422 if (SIInstrInfo::isMUBUF(*MI))
423 return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
424
425 const SIInstrInfo *TII = ST.getInstrInfo();
  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
427 }
428
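// Materialize BaseReg = FrameIdx + Offset so that later scratch accesses can
// be rewritten to use BaseReg with a smaller immediate. With flat scratch the
// result is kept in an SGPR; otherwise it is built in a VGPR using
// TII->getAddNoCarry().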
void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                  Register BaseReg,
                                                  int FrameIdx,
                                                  int64_t Offset) const {
433 MachineBasicBlock::iterator Ins = MBB->begin();
434 DebugLoc DL; // Defaults to "unknown"
435
436 if (Ins != MBB->end())
437 DL = Ins->getDebugLoc();
438
439 MachineFunction *MF = MBB->getParent();
440 const SIInstrInfo *TII = ST.getInstrInfo();
441 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
442 : AMDGPU::V_MOV_B32_e32;
443
444 if (Offset == 0) {
445 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
446 .addFrameIndex(FrameIdx);
447 return;
448 }
449
450 MachineRegisterInfo &MRI = MF->getRegInfo();
451 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
452
453 Register FIReg = MRI.createVirtualRegister(
454 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
455 : &AMDGPU::VGPR_32RegClass);
456
457 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
458 .addImm(Offset);
459 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
460 .addFrameIndex(FrameIdx);
461
  if (ST.enableFlatScratch()) {
463 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
464 .addReg(OffsetReg, RegState::Kill)
465 .addReg(FIReg);
466 return;
467 }
468
469 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
470 .addReg(OffsetReg, RegState::Kill)
471 .addReg(FIReg)
472 .addImm(0); // clamp bit
473 }
474
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
477 const SIInstrInfo *TII = ST.getInstrInfo();
478 bool IsFlat = TII->isFLATScratch(MI);
479
480 #ifndef NDEBUG
481 // FIXME: Is it possible to be storing a frame index to itself?
482 bool SeenFI = false;
483 for (const MachineOperand &MO: MI.operands()) {
484 if (MO.isFI()) {
485 if (SeenFI)
486 llvm_unreachable("should not see multiple frame indices");
487
488 SeenFI = true;
489 }
490 }
491 #endif
492
493 MachineOperand *FIOp =
494 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
495 : AMDGPU::OpName::vaddr);
496
497 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
498 int64_t NewOffset = OffsetOp->getImm() + Offset;
499
#ifndef NDEBUG
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
#endif

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert((SOffset->isReg() &&
          SOffset->getReg() ==
              MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg()) ||
         (SOffset->isImm() && SOffset->getImm() == 0));
#endif
520
521 assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
522 "offset should be legal");
523
524 FIOp->ChangeToRegister(BaseReg, false);
525 OffsetOp->setImm(NewOffset);
526 }
527
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
532 return false;
533
534 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
535
536 if (SIInstrInfo::isMUBUF(*MI))
537 return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
538
539 const SIInstrInfo *TII = ST.getInstrInfo();
540 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
541 }
542
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
545 // This is inaccurate. It depends on the instruction and address space. The
546 // only place where we should hit this is for dealing with frame indexes /
547 // private accesses, so this is correct in that case.
548 return &AMDGPU::VGPR_32RegClass;
549 }
550
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
552
553 switch (Op) {
554 case AMDGPU::SI_SPILL_S1024_SAVE:
555 case AMDGPU::SI_SPILL_S1024_RESTORE:
556 case AMDGPU::SI_SPILL_V1024_SAVE:
557 case AMDGPU::SI_SPILL_V1024_RESTORE:
558 case AMDGPU::SI_SPILL_A1024_SAVE:
559 case AMDGPU::SI_SPILL_A1024_RESTORE:
560 return 32;
561 case AMDGPU::SI_SPILL_S512_SAVE:
562 case AMDGPU::SI_SPILL_S512_RESTORE:
563 case AMDGPU::SI_SPILL_V512_SAVE:
564 case AMDGPU::SI_SPILL_V512_RESTORE:
565 case AMDGPU::SI_SPILL_A512_SAVE:
566 case AMDGPU::SI_SPILL_A512_RESTORE:
567 return 16;
568 case AMDGPU::SI_SPILL_S256_SAVE:
569 case AMDGPU::SI_SPILL_S256_RESTORE:
570 case AMDGPU::SI_SPILL_V256_SAVE:
571 case AMDGPU::SI_SPILL_V256_RESTORE:
572 case AMDGPU::SI_SPILL_A256_SAVE:
573 case AMDGPU::SI_SPILL_A256_RESTORE:
574 return 8;
575 case AMDGPU::SI_SPILL_S192_SAVE:
576 case AMDGPU::SI_SPILL_S192_RESTORE:
577 case AMDGPU::SI_SPILL_V192_SAVE:
578 case AMDGPU::SI_SPILL_V192_RESTORE:
579 case AMDGPU::SI_SPILL_A192_SAVE:
580 case AMDGPU::SI_SPILL_A192_RESTORE:
581 return 6;
582 case AMDGPU::SI_SPILL_S160_SAVE:
583 case AMDGPU::SI_SPILL_S160_RESTORE:
584 case AMDGPU::SI_SPILL_V160_SAVE:
585 case AMDGPU::SI_SPILL_V160_RESTORE:
586 case AMDGPU::SI_SPILL_A160_SAVE:
587 case AMDGPU::SI_SPILL_A160_RESTORE:
588 return 5;
589 case AMDGPU::SI_SPILL_S128_SAVE:
590 case AMDGPU::SI_SPILL_S128_RESTORE:
591 case AMDGPU::SI_SPILL_V128_SAVE:
592 case AMDGPU::SI_SPILL_V128_RESTORE:
593 case AMDGPU::SI_SPILL_A128_SAVE:
594 case AMDGPU::SI_SPILL_A128_RESTORE:
595 return 4;
596 case AMDGPU::SI_SPILL_S96_SAVE:
597 case AMDGPU::SI_SPILL_S96_RESTORE:
598 case AMDGPU::SI_SPILL_V96_SAVE:
599 case AMDGPU::SI_SPILL_V96_RESTORE:
600 case AMDGPU::SI_SPILL_A96_SAVE:
601 case AMDGPU::SI_SPILL_A96_RESTORE:
602 return 3;
603 case AMDGPU::SI_SPILL_S64_SAVE:
604 case AMDGPU::SI_SPILL_S64_RESTORE:
605 case AMDGPU::SI_SPILL_V64_SAVE:
606 case AMDGPU::SI_SPILL_V64_RESTORE:
607 case AMDGPU::SI_SPILL_A64_SAVE:
608 case AMDGPU::SI_SPILL_A64_RESTORE:
609 return 2;
610 case AMDGPU::SI_SPILL_S32_SAVE:
611 case AMDGPU::SI_SPILL_S32_RESTORE:
612 case AMDGPU::SI_SPILL_V32_SAVE:
613 case AMDGPU::SI_SPILL_V32_RESTORE:
614 case AMDGPU::SI_SPILL_A32_SAVE:
615 case AMDGPU::SI_SPILL_A32_RESTORE:
616 return 1;
617 default: llvm_unreachable("Invalid spill opcode");
618 }
619 }
620
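// Map an OFFEN (VGPR-addressed) MUBUF store opcode to its OFFSET
// (immediate-addressed) form, or return -1 if there is no such variant. The
// load table below mirrors this for MUBUF loads.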
static int getOffsetMUBUFStore(unsigned Opc) {
622 switch (Opc) {
623 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
624 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
625 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
626 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
627 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
628 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
629 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
630 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
631 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
632 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
633 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
634 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
635 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
636 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
637 default:
638 return -1;
639 }
640 }
641
static int getOffsetMUBUFLoad(unsigned Opc) {
643 switch (Opc) {
644 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
645 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
646 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
647 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
648 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
649 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
650 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
651 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
652 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
653 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
654 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
655 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
656 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
657 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
658 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
659 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
660 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
661 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
662 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
663 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
664 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
665 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
666 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
667 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
668 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
669 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
670 default:
671 return -1;
672 }
673 }
674
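// If this frame index and lane have been assigned an AGPR (or a VGPR for an
// AGPR spill), satisfy the spill with a single accvgpr read/write instead of
// a memory access. Returns a null MachineInstrBuilder if no register was
// assigned.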
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock::iterator MI,
                                           int Index,
                                           unsigned Lane,
                                           unsigned ValueReg,
                                           bool IsKill) {
681 MachineBasicBlock *MBB = MI->getParent();
682 MachineFunction *MF = MI->getParent()->getParent();
683 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
684 const SIInstrInfo *TII = ST.getInstrInfo();
685
686 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
687
688 if (Reg == AMDGPU::NoRegister)
689 return MachineInstrBuilder();
690
691 bool IsStore = MI->mayStore();
692 MachineRegisterInfo &MRI = MF->getRegInfo();
693 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
694
695 unsigned Dst = IsStore ? Reg : ValueReg;
696 unsigned Src = IsStore ? ValueReg : Reg;
697 unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
698 : AMDGPU::V_ACCVGPR_READ_B32;
699
700 auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
701 .addReg(Src, getKillRegState(IsKill));
702 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
703 return MIB;
704 }
705
706 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
707 // need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
713 const SIInstrInfo *TII = ST.getInstrInfo();
714 MachineBasicBlock *MBB = MI->getParent();
715 const DebugLoc &DL = MI->getDebugLoc();
716 bool IsStore = MI->mayStore();
717
718 unsigned Opc = MI->getOpcode();
719 int LoadStoreOp = IsStore ?
720 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
721 if (LoadStoreOp == -1)
722 return false;
723
724 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
725 if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
726 return true;
727
728 MachineInstrBuilder NewMI =
729 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
730 .add(*Reg)
731 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
732 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
733 .addImm(Offset)
734 .addImm(0) // glc
735 .addImm(0) // slc
736 .addImm(0) // tfe
737 .addImm(0) // dlc
738 .addImm(0) // swz
739 .cloneMemRefs(*MI);
740
741 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
742 AMDGPU::OpName::vdata_in);
743 if (VDataIn)
744 NewMI.add(*VDataIn);
745 return true;
746 }
747
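// Expand a spill or reload pseudo into a series of 32-bit scratch accesses,
// one per sub-register of ValueReg. If the immediate offset cannot be
// encoded, an SGPR is scavenged (or the stack/scratch offset register is
// temporarily adjusted) to carry the folded offset.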
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                         unsigned LoadStoreOp,
                                         int Index,
                                         Register ValueReg,
                                         bool IsKill,
                                         MCRegister ScratchOffsetReg,
                                         int64_t InstOffset,
                                         MachineMemOperand *MMO,
                                         RegScavenger *RS) const {
757 MachineBasicBlock *MBB = MI->getParent();
758 MachineFunction *MF = MI->getParent()->getParent();
759 const SIInstrInfo *TII = ST.getInstrInfo();
760 const MachineFrameInfo &MFI = MF->getFrameInfo();
761 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
762
763 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
764 const DebugLoc &DL = MI->getDebugLoc();
765 bool IsStore = Desc->mayStore();
766 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
767
768 bool Scavenged = false;
769 MCRegister SOffset = ScratchOffsetReg;
770
771 const unsigned EltSize = 4;
772 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
773 unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
774 unsigned Size = NumSubRegs * EltSize;
775 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
776 int64_t MaxOffset = Offset + Size - EltSize;
777 int64_t ScratchOffsetRegDelta = 0;
778
779 Align Alignment = MFI.getObjectAlign(Index);
780 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
781
782 assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
783
784 bool IsOffsetLegal = IsFlat
785 ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
786 : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
787 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
788 SOffset = MCRegister();
789
790 // We currently only support spilling VGPRs to EltSize boundaries, meaning
791 // we can simplify the adjustment of Offset here to just scale with
792 // WavefrontSize.
793 if (!IsFlat)
794 Offset *= ST.getWavefrontSize();
795
796 // We don't have access to the register scavenger if this function is called
797 // during PEI::scavengeFrameVirtualRegs().
798 if (RS)
799 SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
800
801 if (!SOffset) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.
809 if (!ScratchOffsetReg)
810 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
811 SOffset = ScratchOffsetReg;
812 ScratchOffsetRegDelta = Offset;
813 } else {
814 Scavenged = true;
815 }
816
817 if (!SOffset)
818 report_fatal_error("could not scavenge SGPR to spill in entry function");
819
820 if (ScratchOffsetReg == AMDGPU::NoRegister) {
821 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
822 .addImm(Offset);
823 } else {
824 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
825 .addReg(ScratchOffsetReg)
826 .addImm(Offset);
827 }
828
829 Offset = 0;
830 }
831
832 if (IsFlat && SOffset == AMDGPU::NoRegister) {
833 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
834 && "Unexpected vaddr for flat scratch with a FI operand");
835
836 assert(ST.hasFlatScratchSTMode());
837 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
838 Desc = &TII->get(LoadStoreOp);
839 }
840
841 Register TmpReg;
842
843 // FIXME: Flat scratch does not have to be limited to a dword per store.
844 for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
845 Register SubReg =
846 NumSubRegs == 1
847 ? ValueReg
848 : Register(getSubReg(ValueReg, getSubRegFromChannel(i)));
849
850 unsigned SOffsetRegState = 0;
851 unsigned SrcDstRegState = getDefRegState(!IsStore);
852 if (i + 1 == e) {
853 SOffsetRegState |= getKillRegState(Scavenged);
854 // The last implicit use carries the "Kill" flag.
855 SrcDstRegState |= getKillRegState(IsKill);
856 }
857
858 // Make sure the whole register is defined if there are undef components by
859 // adding an implicit def of the super-reg on the first instruction.
860 const bool NeedSuperRegDef = NumSubRegs > 1 && IsStore && i == 0;
861
862 auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);
863
864 if (!MIB.getInstr()) {
865 unsigned FinalReg = SubReg;
866
867 const bool IsAGPR = hasAGPRs(RC);
868 if (IsAGPR) {
869 if (!TmpReg) {
870 assert(RS && "Needs to have RegScavenger to spill an AGPR!");
871 // FIXME: change to scavengeRegisterBackwards()
872 TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
873 RS->setRegUsed(TmpReg);
874 }
875 if (IsStore) {
876 auto AccRead = BuildMI(*MBB, MI, DL,
877 TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
878 .addReg(SubReg, getKillRegState(IsKill));
879 if (NeedSuperRegDef)
880 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
881 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
882 }
883 SubReg = TmpReg;
884 }
885
886 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
887 MachineMemOperand *NewMMO =
888 MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize,
889 commonAlignment(Alignment, EltSize * i));
890
891 MIB = BuildMI(*MBB, MI, DL, *Desc)
892 .addReg(SubReg,
893 getDefRegState(!IsStore) | getKillRegState(IsKill));
894 if (!IsFlat)
895 MIB.addReg(FuncInfo->getScratchRSrcReg());
896
897 if (SOffset == AMDGPU::NoRegister) {
898 if (!IsFlat)
899 MIB.addImm(0);
900 } else {
901 MIB.addReg(SOffset, SOffsetRegState);
902 }
903 MIB.addImm(Offset)
904 .addImm(0) // glc
905 .addImm(0) // slc
906 .addImm(0); // tfe for MUBUF or dlc for FLAT
907 if (!IsFlat)
908 MIB.addImm(0) // dlc
909 .addImm(0); // swz
910 MIB.addMemOperand(NewMMO);
911
912 if (!IsAGPR && NeedSuperRegDef)
913 MIB.addReg(ValueReg, RegState::ImplicitDefine);
914
915 if (!IsStore && TmpReg != AMDGPU::NoRegister) {
916 MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
917 FinalReg)
918 .addReg(TmpReg, RegState::Kill);
919 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
920 }
921 } else {
922 if (NeedSuperRegDef)
923 MIB.addReg(ValueReg, RegState::ImplicitDefine);
924 }
925
926 if (NumSubRegs > 1) {
927 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
928 }
929 }
930
931 if (ScratchOffsetRegDelta != 0) {
932 // Subtract the offset we added to the ScratchOffset register.
933 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
934 .addReg(SOffset)
935 .addImm(ScratchOffsetRegDelta);
936 }
937 }
938
939 // Generate a VMEM access which loads or stores the VGPR containing an SGPR
940 // spill such that all the lanes set in VGPRLanes are loaded or stored.
941 // This generates exec mask manipulation and will use SGPRs available in MI
942 // or VGPR lanes in the VGPR to save and restore the exec mask.
void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
                                             int Index, int Offset,
                                             unsigned EltSize, Register VGPR,
                                             int64_t VGPRLanes,
                                             RegScavenger *RS,
                                             bool IsLoad) const {
949 MachineBasicBlock *MBB = MI->getParent();
950 MachineFunction *MF = MBB->getParent();
951 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
952 const SIInstrInfo *TII = ST.getInstrInfo();
953
954 Register SuperReg = MI->getOperand(0).getReg();
955 const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
956 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
957 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
958 unsigned FirstPart = Offset * 32;
959 unsigned ExecLane = 0;
960
961 bool IsKill = MI->getOperand(0).isKill();
962 const DebugLoc &DL = MI->getDebugLoc();
963
964 // Cannot handle load/store to EXEC
965 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
966 SuperReg != AMDGPU::EXEC && "exec should never spill");
967
968 // On Wave32 only handle EXEC_LO.
  // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
970 bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
971
972 unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
973 Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
974 Register SavedExecReg;
975
976 // Backup EXEC
977 if (OnlyExecLo) {
978 SavedExecReg =
979 NumSubRegs == 1
980 ? SuperReg
981 : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]));
982 } else {
983 // If src/dst is an odd size it is possible subreg0 is not aligned.
984 for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
985 SavedExecReg = getMatchingSuperReg(
986 getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
987 &AMDGPU::SReg_64_XEXECRegClass);
988 if (SavedExecReg)
989 break;
990 }
991 }
992 assert(SavedExecReg);
993 BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
994
995 // Setup EXEC
996 BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
997
998 // Load/store VGPR
999 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1000 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1001
1002 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
1003 ? getBaseRegister()
1004 : getFrameRegister(*MF);
1005
1006 Align Alignment = FrameInfo.getObjectAlign(Index);
1007 MachinePointerInfo PtrInfo =
1008 MachinePointerInfo::getFixedStack(*MF, Index);
1009 MachineMemOperand *MMO = MF->getMachineMemOperand(
1010 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1011 EltSize, Alignment);
1012
1013 if (IsLoad) {
1014 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1015 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1016 buildSpillLoadStore(MI, Opc,
1017 Index,
1018 VGPR, false,
1019 FrameReg,
1020 Offset * EltSize, MMO,
1021 RS);
1022 } else {
1023 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1024 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1025 buildSpillLoadStore(MI, Opc, Index, VGPR,
1026 IsKill, FrameReg,
1027 Offset * EltSize, MMO, RS);
1028 // This only ever adds one VGPR spill
1029 MFI->addToSpilledVGPRs(1);
1030 }
1031
1032 // Restore EXEC
1033 BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
1034 .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
1035
1036 // Restore clobbered SGPRs
1037 if (IsLoad) {
1038 // Nothing to do; register will be overwritten
1039 } else if (!IsKill) {
1040 // Restore SGPRs from appropriate VGPR lanes
1041 if (!OnlyExecLo) {
1042 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
1043 getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
1044 .addReg(VGPR)
1045 .addImm(ExecLane + 1);
1046 }
1047 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
1048 NumSubRegs == 1 ? SavedExecReg
1049 : Register(getSubReg(
1050 SuperReg, SplitParts[FirstPart + ExecLane])))
1051 .addReg(VGPR, RegState::Kill)
1052 .addImm(ExecLane);
1053 }
1054 }
1055
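// Spill an SGPR or SGPR tuple. If VGPR lanes were reserved for this frame
// index, write each 32-bit component into a lane with V_WRITELANE_B32;
// otherwise pack the components into a scavenged VGPR and store that VGPR to
// the stack via buildSGPRSpillLoadStore.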
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS,
                               bool OnlyToVGPR) const {
1060 MachineBasicBlock *MBB = MI->getParent();
1061 MachineFunction *MF = MBB->getParent();
1062 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1063 DenseSet<Register> SGPRSpillVGPRDefinedSet; // FIXME: This should be removed
1064
1065 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
1066 = MFI->getSGPRToVGPRSpills(Index);
1067 bool SpillToVGPR = !VGPRSpills.empty();
1068 if (OnlyToVGPR && !SpillToVGPR)
1069 return false;
1070
1071 const SIInstrInfo *TII = ST.getInstrInfo();
1072
1073 Register SuperReg = MI->getOperand(0).getReg();
1074 bool IsKill = MI->getOperand(0).isKill();
1075 const DebugLoc &DL = MI->getDebugLoc();
1076
1077 assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
1078 SuperReg != MFI->getFrameOffsetReg()));
1079
1080 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
1081 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1082 SuperReg != AMDGPU::EXEC && "exec should never spill");
1083
1084 unsigned EltSize = 4;
1085 const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1086
1087 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1088 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1089
1090 if (SpillToVGPR) {
1091 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1092 Register SubReg = NumSubRegs == 1
1093 ? SuperReg
1094 : Register(getSubReg(SuperReg, SplitParts[i]));
1095 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1096
1097 bool UseKill = IsKill && i == NumSubRegs - 1;
1098
1099 // During SGPR spilling to VGPR, determine if the VGPR is defined. The
1100 // only circumstance in which we say it is undefined is when it is the
1101 // first spill to this VGPR in the first basic block.
1102 bool VGPRDefined = true;
1103 if (MBB == &MF->front())
1104 VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
1105
1106 // Mark the "old value of vgpr" input undef only if this is the first sgpr
1107 // spill to this specific vgpr in the first basic block.
1108 auto MIB =
1109 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
1110 .addReg(SubReg, getKillRegState(UseKill))
1111 .addImm(Spill.Lane)
1112 .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
1113
1114 if (i == 0 && NumSubRegs > 1) {
1115 // We may be spilling a super-register which is only partially defined,
1116 // and need to ensure later spills think the value is defined.
1117 MIB.addReg(SuperReg, RegState::ImplicitDefine);
1118 }
1119
1120 if (NumSubRegs > 1)
1121 MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit);
1122
1123 // FIXME: Since this spills to another register instead of an actual
1124 // frame index, we should delete the frame index when all references to
1125 // it are fixed.
1126 }
1127 } else {
1128 // Scavenged temporary VGPR to use. It must be scavenged once for any number
1129 // of spilled subregs.
1130 Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1131 RS->setRegUsed(TmpVGPR);
1132
1133 // SubReg carries the "Kill" flag when SubReg == SuperReg.
1134 unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
1135
1136 unsigned PerVGPR = 32;
1137 unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
1138 int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
1139
1140 for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
1141 unsigned TmpVGPRFlags = RegState::Undef;
1142
1143 // Write sub registers into the VGPR
1144 for (unsigned i = Offset * PerVGPR,
1145 e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
1146 i < e; ++i) {
1147 Register SubReg = NumSubRegs == 1
1148 ? SuperReg
1149 : Register(getSubReg(SuperReg, SplitParts[i]));
1150
1151 MachineInstrBuilder WriteLane =
1152 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
1153 .addReg(SubReg, SubKillState)
1154 .addImm(i % PerVGPR)
1155 .addReg(TmpVGPR, TmpVGPRFlags);
1156 TmpVGPRFlags = 0;
1157
1158 // There could be undef components of a spilled super register.
1159 // TODO: Can we detect this and skip the spill?
1160 if (NumSubRegs > 1) {
1161 // The last implicit use of the SuperReg carries the "Kill" flag.
1162 unsigned SuperKillState = 0;
1163 if (i + 1 == NumSubRegs)
1164 SuperKillState |= getKillRegState(IsKill);
1165 WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
1166 }
1167 }
1168
1169 // Write out VGPR
1170 buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
1171 RS, false);
1172 }
1173 }
1174
1175 MI->eraseFromParent();
1176 MFI->addToSpilledSGPRs(NumSubRegs);
1177 return true;
1178 }
1179
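// Reload an SGPR or SGPR tuple, either by reading the reserved VGPR lanes
// back with V_READLANE_B32 or by reloading the packed VGPR from the stack
// and unpacking its lanes.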
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 bool OnlyToVGPR) const {
1184 MachineFunction *MF = MI->getParent()->getParent();
1185 MachineBasicBlock *MBB = MI->getParent();
1186 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1187
1188 ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
1189 = MFI->getSGPRToVGPRSpills(Index);
1190 bool SpillToVGPR = !VGPRSpills.empty();
1191 if (OnlyToVGPR && !SpillToVGPR)
1192 return false;
1193
1194 const SIInstrInfo *TII = ST.getInstrInfo();
1195 const DebugLoc &DL = MI->getDebugLoc();
1196
1197 Register SuperReg = MI->getOperand(0).getReg();
1198
1199 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
1200 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1201 SuperReg != AMDGPU::EXEC && "exec should never spill");
1202
1203 unsigned EltSize = 4;
1204
1205 const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1206
1207 ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1208 unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1209
1210 if (SpillToVGPR) {
1211 for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1212 Register SubReg = NumSubRegs == 1
1213 ? SuperReg
1214 : Register(getSubReg(SuperReg, SplitParts[i]));
1215
1216 SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1217 auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
1218 .addReg(Spill.VGPR)
1219 .addImm(Spill.Lane);
1220 if (NumSubRegs > 1 && i == 0)
1221 MIB.addReg(SuperReg, RegState::ImplicitDefine);
1222 }
1223 } else {
1224 Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1225 RS->setRegUsed(TmpVGPR);
1226
1227 unsigned PerVGPR = 32;
1228 unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
1229 int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
1230
1231 for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
1232 // Load in VGPR data
1233 buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
1234 RS, true);
1235
1236 // Unpack lanes
1237 for (unsigned i = Offset * PerVGPR,
1238 e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
1239 i < e; ++i) {
1240 Register SubReg = NumSubRegs == 1
1241 ? SuperReg
1242 : Register(getSubReg(SuperReg, SplitParts[i]));
1243
1244 bool LastSubReg = (i + 1 == e);
1245 auto MIB =
1246 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
1247 .addReg(TmpVGPR, getKillRegState(LastSubReg))
1248 .addImm(i);
1249 if (NumSubRegs > 1 && i == 0)
1250 MIB.addReg(SuperReg, RegState::ImplicitDefine);
1251 }
1252 }
1253 }
1254
1255 MI->eraseFromParent();
1256 return true;
1257 }
1258
1259 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
1260 /// a VGPR and the stack slot can be safely eliminated when all other users are
1261 /// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
    MachineBasicBlock::iterator MI,
    int FI,
    RegScavenger *RS) const {
1266 switch (MI->getOpcode()) {
1267 case AMDGPU::SI_SPILL_S1024_SAVE:
1268 case AMDGPU::SI_SPILL_S512_SAVE:
1269 case AMDGPU::SI_SPILL_S256_SAVE:
1270 case AMDGPU::SI_SPILL_S192_SAVE:
1271 case AMDGPU::SI_SPILL_S160_SAVE:
1272 case AMDGPU::SI_SPILL_S128_SAVE:
1273 case AMDGPU::SI_SPILL_S96_SAVE:
1274 case AMDGPU::SI_SPILL_S64_SAVE:
1275 case AMDGPU::SI_SPILL_S32_SAVE:
1276 return spillSGPR(MI, FI, RS, true);
1277 case AMDGPU::SI_SPILL_S1024_RESTORE:
1278 case AMDGPU::SI_SPILL_S512_RESTORE:
1279 case AMDGPU::SI_SPILL_S256_RESTORE:
1280 case AMDGPU::SI_SPILL_S192_RESTORE:
1281 case AMDGPU::SI_SPILL_S160_RESTORE:
1282 case AMDGPU::SI_SPILL_S128_RESTORE:
1283 case AMDGPU::SI_SPILL_S96_RESTORE:
1284 case AMDGPU::SI_SPILL_S64_RESTORE:
1285 case AMDGPU::SI_SPILL_S32_RESTORE:
1286 return restoreSGPR(MI, FI, RS, true);
1287 default:
1288 llvm_unreachable("not an SGPR spill instruction");
1289 }
1290 }
1291
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
1295 MachineFunction *MF = MI->getParent()->getParent();
1296 MachineBasicBlock *MBB = MI->getParent();
1297 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1298 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1299 const SIInstrInfo *TII = ST.getInstrInfo();
1300 DebugLoc DL = MI->getDebugLoc();
1301
1302 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
1303
1304 MachineOperand &FIOp = MI->getOperand(FIOperandNum);
1305 int Index = MI->getOperand(FIOperandNum).getIndex();
1306
1307 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
1308 ? getBaseRegister()
1309 : getFrameRegister(*MF);
1310
1311 switch (MI->getOpcode()) {
1312 // SGPR register spill
1313 case AMDGPU::SI_SPILL_S1024_SAVE:
1314 case AMDGPU::SI_SPILL_S512_SAVE:
1315 case AMDGPU::SI_SPILL_S256_SAVE:
1316 case AMDGPU::SI_SPILL_S192_SAVE:
1317 case AMDGPU::SI_SPILL_S160_SAVE:
1318 case AMDGPU::SI_SPILL_S128_SAVE:
1319 case AMDGPU::SI_SPILL_S96_SAVE:
1320 case AMDGPU::SI_SPILL_S64_SAVE:
1321 case AMDGPU::SI_SPILL_S32_SAVE: {
1322 spillSGPR(MI, Index, RS);
1323 break;
1324 }
1325
1326 // SGPR register restore
1327 case AMDGPU::SI_SPILL_S1024_RESTORE:
1328 case AMDGPU::SI_SPILL_S512_RESTORE:
1329 case AMDGPU::SI_SPILL_S256_RESTORE:
1330 case AMDGPU::SI_SPILL_S192_RESTORE:
1331 case AMDGPU::SI_SPILL_S160_RESTORE:
1332 case AMDGPU::SI_SPILL_S128_RESTORE:
1333 case AMDGPU::SI_SPILL_S96_RESTORE:
1334 case AMDGPU::SI_SPILL_S64_RESTORE:
1335 case AMDGPU::SI_SPILL_S32_RESTORE: {
1336 restoreSGPR(MI, Index, RS);
1337 break;
1338 }
1339
1340 // VGPR register spill
1341 case AMDGPU::SI_SPILL_V1024_SAVE:
1342 case AMDGPU::SI_SPILL_V512_SAVE:
1343 case AMDGPU::SI_SPILL_V256_SAVE:
1344 case AMDGPU::SI_SPILL_V160_SAVE:
1345 case AMDGPU::SI_SPILL_V128_SAVE:
1346 case AMDGPU::SI_SPILL_V96_SAVE:
1347 case AMDGPU::SI_SPILL_V64_SAVE:
1348 case AMDGPU::SI_SPILL_V32_SAVE:
1349 case AMDGPU::SI_SPILL_A1024_SAVE:
1350 case AMDGPU::SI_SPILL_A512_SAVE:
1351 case AMDGPU::SI_SPILL_A256_SAVE:
1352 case AMDGPU::SI_SPILL_A192_SAVE:
1353 case AMDGPU::SI_SPILL_A160_SAVE:
1354 case AMDGPU::SI_SPILL_A128_SAVE:
1355 case AMDGPU::SI_SPILL_A96_SAVE:
1356 case AMDGPU::SI_SPILL_A64_SAVE:
1357 case AMDGPU::SI_SPILL_A32_SAVE: {
1358 const MachineOperand *VData = TII->getNamedOperand(*MI,
1359 AMDGPU::OpName::vdata);
1360 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1361 MFI->getStackPtrOffsetReg());
1362
1363 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1364 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1365 buildSpillLoadStore(MI, Opc,
1366 Index,
1367 VData->getReg(), VData->isKill(),
1368 FrameReg,
1369 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1370 *MI->memoperands_begin(),
1371 RS);
1372 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
1373 MI->eraseFromParent();
1374 break;
1375 }
1376 case AMDGPU::SI_SPILL_V32_RESTORE:
1377 case AMDGPU::SI_SPILL_V64_RESTORE:
1378 case AMDGPU::SI_SPILL_V96_RESTORE:
1379 case AMDGPU::SI_SPILL_V128_RESTORE:
1380 case AMDGPU::SI_SPILL_V160_RESTORE:
1381 case AMDGPU::SI_SPILL_V256_RESTORE:
1382 case AMDGPU::SI_SPILL_V512_RESTORE:
1383 case AMDGPU::SI_SPILL_V1024_RESTORE:
1384 case AMDGPU::SI_SPILL_A32_RESTORE:
1385 case AMDGPU::SI_SPILL_A64_RESTORE:
1386 case AMDGPU::SI_SPILL_A96_RESTORE:
1387 case AMDGPU::SI_SPILL_A128_RESTORE:
1388 case AMDGPU::SI_SPILL_A160_RESTORE:
1389 case AMDGPU::SI_SPILL_A192_RESTORE:
1390 case AMDGPU::SI_SPILL_A256_RESTORE:
1391 case AMDGPU::SI_SPILL_A512_RESTORE:
1392 case AMDGPU::SI_SPILL_A1024_RESTORE: {
1393 const MachineOperand *VData = TII->getNamedOperand(*MI,
1394 AMDGPU::OpName::vdata);
1395 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1396 MFI->getStackPtrOffsetReg());
1397
1398 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1399 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1400 buildSpillLoadStore(MI, Opc,
1401 Index,
1402 VData->getReg(), VData->isKill(),
1403 FrameReg,
1404 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1405 *MI->memoperands_begin(),
1406 RS);
1407 MI->eraseFromParent();
1408 break;
1409 }
1410
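  // Any other use of a frame index: fold FrameReg plus the object offset
  // directly into the operand, scavenging an SGPR or VGPR temporary when the
  // value cannot be encoded in place.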
1411 default: {
1412 const DebugLoc &DL = MI->getDebugLoc();
1413
1414 int64_t Offset = FrameInfo.getObjectOffset(Index);
1415 if (ST.enableFlatScratch()) {
1416 if (TII->isFLATScratch(*MI)) {
1417 // The offset is always swizzled, just replace it
1418 if (FrameReg)
1419 FIOp.ChangeToRegister(FrameReg, false);
1420
1421 if (!Offset)
1422 return;
1423
1424 MachineOperand *OffsetOp =
1425 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1426 int64_t NewOffset = Offset + OffsetOp->getImm();
1427 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1428 true)) {
1429 OffsetOp->setImm(NewOffset);
1430 if (FrameReg)
1431 return;
1432 Offset = 0;
1433 }
1434
1435 assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
1436 "Unexpected vaddr for flat scratch with a FI operand");
1437
1438 // On GFX10 we have ST mode to use no registers for an address.
1439 // Otherwise we need to materialize 0 into an SGPR.
1440 if (!Offset && ST.hasFlatScratchSTMode()) {
1441 unsigned Opc = MI->getOpcode();
1442 unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
1443 MI->RemoveOperand(
1444 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
1445 MI->setDesc(TII->get(NewOpc));
1446 return;
1447 }
1448 }
1449
1450 if (!FrameReg) {
1451 FIOp.ChangeToImmediate(Offset);
1452 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
1453 return;
1454 }
1455
1456 // We need to use register here. Check if we can use an SGPR or need
1457 // a VGPR.
        FIOp.ChangeToRegister(AMDGPU::M0, false);
        bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);

        if (!Offset && FrameReg && UseSGPR) {
          FIOp.setReg(FrameReg);
          return;
        }

        const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                                : &AMDGPU::VGPR_32RegClass;

        Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
        FIOp.setReg(TmpReg);
        FIOp.setIsKill(true);

        if ((!FrameReg || !Offset) && TmpReg) {
          unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
          auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
          if (FrameReg)
            MIB.addReg(FrameReg);
          else
            MIB.addImm(Offset);

          return;
        }

        Register TmpSReg =
            UseSGPR ? TmpReg
                    : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
                                           !UseSGPR);

        // TODO: for flat scratch another attempt can be made with a VGPR index
        // if no SGPRs can be scavenged.
        if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
          report_fatal_error("Cannot scavenge register in FI elimination!");

        if (!TmpSReg) {
          // Use frame register and restore it after.
          TmpSReg = FrameReg;
          FIOp.setReg(FrameReg);
          FIOp.setIsKill(false);
        }

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
          .addReg(FrameReg)
          .addImm(Offset);

        if (!UseSGPR)
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addReg(TmpSReg, RegState::Kill);

        if (TmpSReg == FrameReg) {
          // Undo frame register modification.
          BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32),
                  FrameReg)
            .addReg(FrameReg)
            .addImm(Offset);
        }
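        // When the frame register itself has to double as the temporary, the
        // sequence emitted above is, in outline (register names are
        // placeholders):
        //   s_add_u32  <fp>, <fp>, <offset>   ; temporarily fold the offset
        //   v_mov_b32  <tmp>, <fp>            ; only when a VGPR is required
        //   s_sub_u32  <fp>, <fp>, <offset>   ; restore the frame register
        // so the frame register value is unchanged after the access.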

        return;
      }

      bool IsMUBUF = TII->isMUBUF(*MI);

      if (!IsMUBUF && !MFI->isEntryFunction()) {
        // Convert to a swizzled stack address by scaling by the wave size.
        //
        // In an entry function/kernel the offset is already swizzled.
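        //
        // Roughly, the value computed below is
        //   ResultReg = (FrameReg >> log2(wavefront size)) + ObjectOffset
        // where the shift converts the wave-scaled (unswizzled) frame-register
        // value into a per-lane swizzled address.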

        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
        Register ResultReg =
            IsCopy ? MI->getOperand(0).getReg()
                   : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        if (Offset == 0) {
          // XXX - This never happens because of emergency scavenging slot at 0?
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
            .addImm(ST.getWavefrontSizeLog2())
            .addReg(FrameReg);
        } else {
          if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
            // Reuse ResultReg in intermediate step.
            Register ScaledReg = ResultReg;

            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                    ScaledReg)
              .addImm(ST.getWavefrontSizeLog2())
              .addReg(FrameReg);

            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;

            // TODO: Fold if use instruction is another add of a constant.
            if (IsVOP2 ||
                AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
              // FIXME: This can fail
              MIB.addImm(Offset);
              MIB.addReg(ScaledReg, RegState::Kill);
              if (!IsVOP2)
                MIB.addImm(0); // clamp bit
            } else {
              assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
                     "Need to reuse carry out register");

              // Use scavenged unused carry out as offset register.
              Register ConstOffsetReg;
              if (!isWave32)
                ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
              else
                ConstOffsetReg = MIB.getReg(1);

              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
                      ConstOffsetReg)
                .addImm(Offset);
              MIB.addReg(ConstOffsetReg, RegState::Kill);
              MIB.addReg(ScaledReg, RegState::Kill);
              MIB.addImm(0); // clamp bit
            }
          } else {
            // We have to produce a carry out, and there isn't a free SGPR pair
            // for it. We can keep the whole computation on the SALU to avoid
            // clobbering an additional register at the cost of an extra mov.

            // We may have 1 free scratch SGPR even though a carry out is
            // unavailable. Only one additional mov is needed.
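            //
            // The SALU fallback emitted below is, in outline:
            //   s_lshr_b32 <scaled>, <fp>, log2(wave size)
            //   s_add_u32  <scaled>, <scaled>, <offset>
            //   COPY       <result_vgpr>, <scaled>
            // followed by an s_sub_u32 / s_lshl_b32 pair that restores the
            // frame register when it had to serve as the scratch SGPR.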
            Register TmpScaledReg =
                RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;

            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
              .addReg(FrameReg)
              .addImm(ST.getWavefrontSizeLog2());
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
              .addReg(ScaledReg, RegState::Kill)
              .addImm(Offset);
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
              .addReg(ScaledReg, RegState::Kill);

            // If there were truly no free SGPRs, we need to undo everything.
            if (!TmpScaledReg.isValid()) {
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(Offset);
              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(FrameReg)
                .addImm(ST.getWavefrontSizeLog2());
            }
          }
        }

        // Don't introduce an extra copy if we're just materializing in a mov.
        if (IsCopy)
          MI->eraseFromParent();
        else
          FIOp.ChangeToRegister(ResultReg, false, false, true);
        return;
      }

      if (IsMUBUF) {
        // Disable offen so we don't need a 0 vgpr base.
        assert(static_cast<int>(FIOperandNum) ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::vaddr));

        auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
        assert((SOffset.isReg() &&
                SOffset.getReg() == MFI->getStackPtrOffsetReg()) ||
               (SOffset.isImm() && SOffset.getImm() == 0));
        if (SOffset.isReg()) {
          if (FrameReg == AMDGPU::NoRegister) {
            SOffset.ChangeToImmediate(0);
          } else {
            SOffset.setReg(FrameReg);
          }
        } else if (SOffset.isImm() && FrameReg != AMDGPU::NoRegister) {
          SOffset.ChangeToRegister(FrameReg, false);
        }

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
        int64_t NewOffset = OldImm + Offset;
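        // For example (illustrative numbers only): with an instruction offset
        // of 8 and a frame object at offset 64, NewOffset is 72 and can be
        // folded directly, provided it still fits the MUBUF immediate offset
        // field checked below.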

        if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
            buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
          MI->eraseFromParent();
          return;
        }
      }

      // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.

      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}

StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
  return AMDGPUInstPrinter::getRegisterName(Reg);
}

const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth == 1)
    return &AMDGPU::VReg_1RegClass;
  if (BitWidth <= 16)
    return &AMDGPU::VGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::VGPR_32RegClass;
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024RegClass;

  return nullptr;
}
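
// Note: getVGPRClassForBitWidth above, like getAGPRClassForBitWidth and
// getSGPRClassForBitWidth below, returns the narrowest class wide enough for
// the requested width (e.g. a 65-96 bit value maps to the 96-bit class) and
// nullptr for widths above 1024 bits.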

const TargetRegisterClass *
SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 16)
    return &AMDGPU::AGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::AGPR_32RegClass;
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 16)
    return &AMDGPU::SGPR_LO16RegClass;
  if (BitWidth <= 32)
    return &AMDGPU::SReg_32RegClass;
  if (BitWidth <= 64)
    return &AMDGPU::SReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::SGPR_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::SGPR_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::SGPR_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::SGPR_192RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::SGPR_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::SGPR_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::SGPR_1024RegClass;

  return nullptr;
}

// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *
SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_LO16RegClass,
    &AMDGPU::VGPR_HI16RegClass,
    &AMDGPU::SReg_LO16RegClass,
    &AMDGPU::AGPR_LO16RegClass,
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::AGPR_32RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::AReg_64RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::SReg_96RegClass,
    &AMDGPU::AReg_96RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::AReg_128RegClass,
    &AMDGPU::VReg_160RegClass,
    &AMDGPU::SReg_160RegClass,
    &AMDGPU::AReg_160RegClass,
    &AMDGPU::VReg_192RegClass,
    &AMDGPU::SReg_192RegClass,
    &AMDGPU::AReg_192RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::AReg_256RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::AReg_512RegClass,
    &AMDGPU::SReg_1024RegClass,
    &AMDGPU::VReg_1024RegClass,
    &AMDGPU::AReg_1024RegClass,
    &AMDGPU::SCC_CLASSRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}

// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size == 16) {
    return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
           getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
  }
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  if (!VRC) {
    assert(Size < 32 && "Invalid register class size");
    return false;
  }
  return getCommonSubClass(VRC, RC) != nullptr;
}

bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size < 16)
    return false;
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  if (!ARC) {
    assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
    return false;
  }
  return getCommonSubClass(ARC, RC) != nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  assert(VRC && "Invalid register class size");
  return VRC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  assert(ARC && "Invalid register class size");
  return ARC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
  unsigned Size = getRegSizeInBits(*VRC);
  if (Size == 32)
    return &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
  assert(SRC && "Invalid register class size");
  return SRC;
}

const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
    const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
  if (isSGPRClass(RC)) {
    if (Size == 32)
      RC = &AMDGPU::SGPR_32RegClass;
    else
      RC = getSGPRClassForBitWidth(Size);
  } else if (hasAGPRs(RC)) {
    RC = getAGPRClassForBitWidth(Size);
  } else {
    RC = getVGPRClassForBitWidth(Size);
  }
  assert(RC && "Invalid sub-register class size");
  return RC;
}

bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
  if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
      OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return !ST.hasMFMAInlineLiteralBug();

  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
         OpType <= AMDGPU::OPERAND_SRC_LAST;
}

bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want to
  // stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so we
  // only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // %0 = ...
  // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
  // %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find:
  // => %3 = COPY %0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

/// Returns the lowest register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return the
/// highest unused register.
MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                              const TargetRegisterClass *RC,
                                              const MachineFunction &MF,
                                              bool ReserveHighestVGPR) const {
  if (ReserveHighestVGPR) {
    for (MCRegister Reg : reverse(*RC))
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  } else {
    for (MCRegister Reg : *RC)
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  }
  return MCRegister();
}

ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
  assert(RegBitWidth >= 32 && RegBitWidth <= 1024);

  const unsigned RegDWORDs = RegBitWidth / 32;
  const unsigned EltDWORDs = EltSize / 4;
  assert(RegSplitParts.size() + 1 >= EltDWORDs);

  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
  const unsigned NumParts = RegDWORDs / EltDWORDs;

  return makeArrayRef(Parts.data(), NumParts);
}
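
// For example (illustrative): splitting a 128-bit register class with
// EltSize == 8 (64-bit elements) yields the two 64-bit sub-register indices
// covering dwords {0,1} and {2,3}, i.e. sub0_sub1 and sub2_sub3.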

const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  Register Reg) const {
  return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && hasVGPRs(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);

  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && hasAGPRs(RC);
}

bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase the size of registers beyond a dword; we would need to
  // allocate adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}
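
// For example: a coalesce that would merge two 64-bit values into a single
// 128-bit register is rejected (NewSize exceeds both operand sizes), while any
// coalesce involving a 32-bit operand is always allowed.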

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
  case AMDGPU::VGPR_LO16RegClassID:
  case AMDGPU::VGPR_HI16RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::SGPR_LO16RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
      Idx == AMDGPU::RegisterPressureSets::AGPR_32)
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  llvm_unreachable("Unexpected register pressure set!");
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (RegPressureIgnoredUnits[RegUnit])
    return Empty;

  return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}

MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee saved register.
  return AMDGPU::SGPR30_SGPR31;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB,
                                         const MachineRegisterInfo &MRI) const {
  switch (RB.getID()) {
  case AMDGPU::VGPRRegBankID:
    return getVGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::VCCRegBankID:
    assert(Size == 1);
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case AMDGPU::SGPRRegBankID:
    return getSGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::AGPRRegBankID:
    return getAGPRClassForBitWidth(std::max(32u, Size));
  default:
    llvm_unreachable("unknown register bank");
  }
}

const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                         const MachineRegisterInfo &MRI) const {
  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);

  const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
  return getAllocatableClass(RC);
}

MCRegister SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPUGenRegisterInfo::getRegClass(RCID);
  }
}

// Find reaching register definition
MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (Reg.isVirtual()) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find last def.
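    // For a physical register, walk every register unit: each unit must have a
    // value live at the use, and among those defs we keep the one dominated by
    // the others, i.e. the latest reaching def.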
    for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
         ++Units) {
      LiveRange &LR = LIS->getRegUnit(*Units);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}

MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
  assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);

  for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
                                         AMDGPU::SReg_32RegClass,
                                         AMDGPU::AGPR_32RegClass } ) {
    if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
      return Super;
  }
  if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
                                            &AMDGPU::VGPR_32RegClass)) {
    return Super;
  }

  return AMDGPU::NoRegister;
}
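
// For example (illustrative, assuming the usual 16-bit physical register
// naming): the low half VGPR0_LO16 maps back to its 32-bit super-register
// VGPR0.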

bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
  switch (PhysReg) {
  case AMDGPU::SGPR_NULL:
  case AMDGPU::SRC_SHARED_BASE:
  case AMDGPU::SRC_PRIVATE_BASE:
  case AMDGPU::SRC_SHARED_LIMIT:
  case AMDGPU::SRC_PRIVATE_LIMIT:
    return true;
  default:
    return false;
  }
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 2);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}