1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI implementation of the TargetRegisterInfo class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIRegisterInfo.h"
15 #include "AMDGPURegisterBankInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "SIInstrInfo.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "MCTargetDesc/AMDGPUInstPrinter.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/MachineDominators.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/CodeGen/RegisterScavenging.h"
26 #include "llvm/CodeGen/SlotIndexes.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/LLVMContext.h"
29 #include <vector>
30 
31 using namespace llvm;
32 
33 #define GET_REGINFO_TARGET_DESC
34 #include "AMDGPUGenRegisterInfo.inc"
35 
36 static cl::opt<bool> EnableSpillSGPRToVGPR(
37   "amdgpu-spill-sgpr-to-vgpr",
38   cl::desc("Enable spilling VGPRs to SGPRs"),
39   cl::ReallyHidden,
40   cl::init(true));
41 
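// RegSplitParts[N-1][P] holds the subregister index that covers N dwords
// starting at dword position P * N within a register tuple.
// SubRegFromChannelTable maps (width index, starting channel) to the matching
// subregister index. Both tables are filled once in the constructor below.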
42 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
43 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
44 
45 // Map numbers of DWORDs to indexes in SubRegFromChannelTable.
46 // Valid indexes are shifted by 1, such that a 0 mapping means unsupported.
47 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
48 //      meaning index 7 in SubRegFromChannelTable.
49 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
50     0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
51 
52 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
53     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
54       SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
55 
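  // Every 32-bit subregister is expected to cover exactly two lane-mask bits,
  // one for its lo16 half and one for its hi16 half; getNumCoveredRegs()
  // depends on this packing, which the assert below verifies.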
56   assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
57          getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
58          (getSubRegIndexLaneMask(AMDGPU::lo16) |
59           getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
60            getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
61          "getNumCoveredRegs() will not work with generated subreg masks!");
62 
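  // Exclude M0 and the artificial 16-bit high halves of VGPRs from register
  // pressure tracking.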
63   RegPressureIgnoredUnits.resize(getNumRegUnits());
64   RegPressureIgnoredUnits.set(
65       *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
66   for (auto Reg : AMDGPU::VGPR_HI16RegClass)
67     RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
68 
69   // HACK: Until this is fully tablegen'd.
70   static llvm::once_flag InitializeRegSplitPartsFlag;
71 
72   static auto InitializeRegSplitPartsOnce = [this]() {
73     for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
74       unsigned Size = getSubRegIdxSize(Idx);
75       if (Size & 31)
76         continue;
77       std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
78       unsigned Pos = getSubRegIdxOffset(Idx);
79       if (Pos % Size)
80         continue;
81       Pos /= Size;
82       if (Vec.empty()) {
83         unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
84         Vec.resize(MaxNumParts);
85       }
86       Vec[Pos] = Idx;
87     }
88   };
89 
90   static llvm::once_flag InitializeSubRegFromChannelTableFlag;
91 
92   static auto InitializeSubRegFromChannelTableOnce = [this]() {
93     for (auto &Row : SubRegFromChannelTable)
94       Row.fill(AMDGPU::NoSubRegister);
95     for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
96       unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
97       unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
98       assert(Width < SubRegFromChannelTableWidthMap.size());
99       Width = SubRegFromChannelTableWidthMap[Width];
100       if (Width == 0)
101         continue;
102       unsigned TableIdx = Width - 1;
103       assert(TableIdx < SubRegFromChannelTable.size());
104       assert(Offset < SubRegFromChannelTable[TableIdx].size());
105       SubRegFromChannelTable[TableIdx][Offset] = Idx;
106     }
107   };
108 
109   llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
110   llvm::call_once(InitializeSubRegFromChannelTableFlag,
111                   InitializeSubRegFromChannelTableOnce);
112 }
113 
114 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
115                                            MCRegister Reg) const {
116   MCRegAliasIterator R(Reg, this, true);
117 
118   for (; R.isValid(); ++R)
119     Reserved.set(*R);
120 }
121 
122 // Forced to be here by one .inc
123 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
124   const MachineFunction *MF) const {
125   CallingConv::ID CC = MF->getFunction().getCallingConv();
126   switch (CC) {
127   case CallingConv::C:
128   case CallingConv::Fast:
129   case CallingConv::Cold:
130   case CallingConv::AMDGPU_Gfx:
131     return CSR_AMDGPU_HighRegs_SaveList;
132   default: {
133     // Dummy to not crash RegisterClassInfo.
134     static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
135     return &NoCalleeSavedReg;
136   }
137   }
138 }
139 
140 const MCPhysReg *
141 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
142   return nullptr;
143 }
144 
145 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
146                                                      CallingConv::ID CC) const {
147   switch (CC) {
148   case CallingConv::C:
149   case CallingConv::Fast:
150   case CallingConv::Cold:
151   case CallingConv::AMDGPU_Gfx:
152     return CSR_AMDGPU_HighRegs_RegMask;
153   default:
154     return nullptr;
155   }
156 }
157 
158 const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
159   return CSR_AMDGPU_NoRegs_RegMask;
160 }
161 
162 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
163   const SIFrameLowering *TFI =
164       MF.getSubtarget<GCNSubtarget>().getFrameLowering();
165   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
166   // During ISel lowering we always reserve the stack pointer in entry
167   // functions, but never actually want to reference it when accessing our own
168   // frame. If we need a frame pointer we use it, but otherwise we can just use
169   // an immediate "0" which we represent by returning NoRegister.
170   if (FuncInfo->isEntryFunction()) {
171     return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
172   }
173   return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
174                         : FuncInfo->getStackPtrOffsetReg();
175 }
176 
177 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
178   // When we need stack realignment, we can't reference off of the
179   // stack pointer, so we reserve a base pointer.
180   const MachineFrameInfo &MFI = MF.getFrameInfo();
181   return MFI.getNumFixedObjects() && needsStackRealignment(MF);
182 }
183 
184 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
185 
186 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
187   return CSR_AMDGPU_AllVGPRs_RegMask;
188 }
189 
190 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
191   return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
192 }
193 
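// Returns the subregister index covering NumRegs dwords starting at dword
// Channel, e.g. getSubRegFromChannel(2, 2) yields the index for sub2_sub3.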
194 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
195                                               unsigned NumRegs) {
196   assert(NumRegs < SubRegFromChannelTableWidthMap.size());
197   unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
198   assert(NumRegIndex && "Not implemented");
199   assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
200   return SubRegFromChannelTable[NumRegIndex - 1][Channel];
201 }
202 
203 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
204   const MachineFunction &MF) const {
205   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
206   MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
207   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
208 }
209 
210 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
211   BitVector Reserved(getNumRegs());
212   Reserved.set(AMDGPU::MODE);
213 
214   // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
215   // this seems likely to result in bugs, so I'm marking them as reserved.
216   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
217   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
218 
219   // M0 has to be reserved so that llvm accepts it as a live-in into a block.
220   reserveRegisterTuples(Reserved, AMDGPU::M0);
221 
222   // Reserve src_vccz, src_execz, src_scc.
223   reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
224   reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
225   reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
226 
227   // Reserve the memory aperture registers.
228   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
229   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
230   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
231   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
232 
233   // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
234   reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
235 
236   // Reserve xnack_mask registers - support is not implemented in Codegen.
237   reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
238 
239   // Reserve lds_direct register - support is not implemented in Codegen.
240   reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
241 
242   // Reserve Trap Handler registers - support is not implemented in Codegen.
243   reserveRegisterTuples(Reserved, AMDGPU::TBA);
244   reserveRegisterTuples(Reserved, AMDGPU::TMA);
245   reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
246   reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
247   reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
248   reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
249   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
250   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
251   reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
252   reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
253 
254   // Reserve null register - it shall never be allocated
255   reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
256 
257   // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
258   // will result in bugs.
259   if (isWave32) {
260     Reserved.set(AMDGPU::VCC);
261     Reserved.set(AMDGPU::VCC_HI);
262   }
263 
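  // SGPRs, VGPRs and AGPRs above the per-function limits computed by the
  // subtarget are never allocatable; reserve them.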
264   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
265   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
266   for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
267     unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
268     reserveRegisterTuples(Reserved, Reg);
269   }
270 
271   unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
272   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
273   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
274     unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
275     reserveRegisterTuples(Reserved, Reg);
276     Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
277     reserveRegisterTuples(Reserved, Reg);
278   }
279 
280   for (auto Reg : AMDGPU::SReg_32RegClass) {
281     Reserved.set(getSubReg(Reg, AMDGPU::hi16));
282     Register Low = getSubReg(Reg, AMDGPU::lo16);
283     // This is to prevent BB vcc liveness errors.
284     if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
285       Reserved.set(Low);
286   }
287 
288   for (auto Reg : AMDGPU::AGPR_32RegClass) {
289     Reserved.set(getSubReg(Reg, AMDGPU::hi16));
290   }
291 
292   // Reserve all remaining AGPRs if there are no instructions that can use them.
293   if (!ST.hasMAIInsts()) {
294     for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
295       unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
296       reserveRegisterTuples(Reserved, Reg);
297     }
298   }
299 
300   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
301 
302   Register ScratchRSrcReg = MFI->getScratchRSrcReg();
303   if (ScratchRSrcReg != AMDGPU::NoRegister) {
304     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
305     // to spill.
306     // TODO: May need to reserve a VGPR if doing LDS spilling.
307     reserveRegisterTuples(Reserved, ScratchRSrcReg);
308   }
309 
310   // We have to assume the SP is needed in case there are calls in the function,
311   // which is detected after the function is lowered. If we aren't really going
312   // to need SP, don't bother reserving it.
313   MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
314 
315   if (StackPtrReg) {
316     reserveRegisterTuples(Reserved, StackPtrReg);
317     assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
318   }
319 
320   MCRegister FrameReg = MFI->getFrameOffsetReg();
321   if (FrameReg) {
322     reserveRegisterTuples(Reserved, FrameReg);
323     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
324   }
325 
326   if (hasBasePointer(MF)) {
327     MCRegister BasePtrReg = getBaseRegister();
328     reserveRegisterTuples(Reserved, BasePtrReg);
329     assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
330   }
331 
332   for (MCRegister Reg : MFI->WWMReservedRegs) {
333     reserveRegisterTuples(Reserved, Reg);
334   }
335 
336   // FIXME: Stop using reserved registers for this.
337   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
338     reserveRegisterTuples(Reserved, Reg);
339 
340   for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
341     reserveRegisterTuples(Reserved, Reg);
342 
343   for (auto SSpill : MFI->getSGPRSpillVGPRs())
344     reserveRegisterTuples(Reserved, SSpill.VGPR);
345 
346   return Reserved;
347 }
348 
349 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
350   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
351   // On entry, the base address is 0, so it can't possibly need any more
352   // alignment.
353 
354   // FIXME: Should be able to specify the entry frame alignment per calling
355   // convention instead.
356   if (Info->isEntryFunction())
357     return false;
358 
359   return TargetRegisterInfo::canRealignStack(MF);
360 }
361 
362 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
363   const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
364   if (Info->isEntryFunction()) {
365     const MachineFrameInfo &MFI = Fn.getFrameInfo();
366     return MFI.hasStackObjects() || MFI.hasCalls();
367   }
368 
369   // May need scavenger for dealing with callee saved registers.
370   return true;
371 }
372 
373 bool SIRegisterInfo::requiresFrameIndexScavenging(
374   const MachineFunction &MF) const {
375   // Do not use frame virtual registers. They used to be used for SGPRs, but
376   // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
377   // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
378   // spill.
379   return false;
380 }
381 
382 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
383   const MachineFunction &MF) const {
384   const MachineFrameInfo &MFI = MF.getFrameInfo();
385   return MFI.hasStackObjects();
386 }
387 
388 bool SIRegisterInfo::requiresVirtualBaseRegisters(
389   const MachineFunction &) const {
390   // There are no special dedicated stack or frame pointers.
391   return true;
392 }
393 
394 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
395   assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
396 
397   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
398                                           AMDGPU::OpName::offset);
399   return MI->getOperand(OffIdx).getImm();
400 }
401 
402 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
403                                                  int Idx) const {
404   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
405     return 0;
406 
407   assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
408                                             AMDGPU::OpName::vaddr) ||
409          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
410                                             AMDGPU::OpName::saddr))) &&
411          "Should never see frame index on non-address operand");
412 
413   return getScratchInstrOffset(MI);
414 }
415 
416 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
417   if (!MI->mayLoadOrStore())
418     return false;
419 
420   int64_t FullOffset = Offset + getScratchInstrOffset(MI);
421 
422   if (SIInstrInfo::isMUBUF(*MI))
423     return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
424 
425   const SIInstrInfo *TII = ST.getInstrInfo();
426   return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
427 }
428 
429 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
430                                                   Register BaseReg,
431                                                   int FrameIdx,
432                                                   int64_t Offset) const {
433   MachineBasicBlock::iterator Ins = MBB->begin();
434   DebugLoc DL; // Defaults to "unknown"
435 
436   if (Ins != MBB->end())
437     DL = Ins->getDebugLoc();
438 
439   MachineFunction *MF = MBB->getParent();
440   const SIInstrInfo *TII = ST.getInstrInfo();
441   unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
442                                            : AMDGPU::V_MOV_B32_e32;
443 
444   if (Offset == 0) {
445     BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
446       .addFrameIndex(FrameIdx);
447     return;
448   }
449 
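  // Materialize the immediate offset in an SGPR and the frame index in a
  // register of the appropriate class, then add the two into BaseReg below.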
450   MachineRegisterInfo &MRI = MF->getRegInfo();
451   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
452 
453   Register FIReg = MRI.createVirtualRegister(
454       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
455                              : &AMDGPU::VGPR_32RegClass);
456 
457   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
458     .addImm(Offset);
459   BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
460     .addFrameIndex(FrameIdx);
461 
462   if (ST.enableFlatScratch()) {
463     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
464         .addReg(OffsetReg, RegState::Kill)
465         .addReg(FIReg);
466     return;
467   }
468 
469   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
470     .addReg(OffsetReg, RegState::Kill)
471     .addReg(FIReg)
472     .addImm(0); // clamp bit
473 }
474 
475 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
476                                        int64_t Offset) const {
477   const SIInstrInfo *TII = ST.getInstrInfo();
478   bool IsFlat = TII->isFLATScratch(MI);
479 
480 #ifndef NDEBUG
481   // FIXME: Is it possible to be storing a frame index to itself?
482   bool SeenFI = false;
483   for (const MachineOperand &MO: MI.operands()) {
484     if (MO.isFI()) {
485       if (SeenFI)
486         llvm_unreachable("should not see multiple frame indices");
487 
488       SeenFI = true;
489     }
490   }
491 #endif
492 
493   MachineOperand *FIOp =
494       TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
495                                       : AMDGPU::OpName::vaddr);
496 
497   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
498   int64_t NewOffset = OffsetOp->getImm() + Offset;
499 
500 #ifndef NDEBUG
501   MachineBasicBlock *MBB = MI.getParent();
502   MachineFunction *MF = MBB->getParent();
503   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
504   assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
505 
506   if (IsFlat) {
507     assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
508            "offset should be legal");
509     FIOp->ChangeToRegister(BaseReg, false);
510     OffsetOp->setImm(NewOffset);
511     return;
512   }
513 
514   MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
515   assert((SOffset->isReg() &&
516           SOffset->getReg() ==
517               MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg()) ||
518          (SOffset->isImm() && SOffset->getImm() == 0));
519 #endif
520 
521   assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
522          "offset should be legal");
523 
524   FIOp->ChangeToRegister(BaseReg, false);
525   OffsetOp->setImm(NewOffset);
526 }
527 
528 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
529                                         Register BaseReg,
530                                         int64_t Offset) const {
531   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
532     return false;
533 
534   int64_t NewOffset = Offset + getScratchInstrOffset(MI);
535 
536   if (SIInstrInfo::isMUBUF(*MI))
537     return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
538 
539   const SIInstrInfo *TII = ST.getInstrInfo();
540   return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
541 }
542 
543 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
544   const MachineFunction &MF, unsigned Kind) const {
545   // This is inaccurate. It depends on the instruction and address space. The
546   // only place where we should hit this is for dealing with frame indexes /
547   // private accesses, so this is correct in that case.
548   return &AMDGPU::VGPR_32RegClass;
549 }
550 
551 static unsigned getNumSubRegsForSpillOp(unsigned Op) {
552 
553   switch (Op) {
554   case AMDGPU::SI_SPILL_S1024_SAVE:
555   case AMDGPU::SI_SPILL_S1024_RESTORE:
556   case AMDGPU::SI_SPILL_V1024_SAVE:
557   case AMDGPU::SI_SPILL_V1024_RESTORE:
558   case AMDGPU::SI_SPILL_A1024_SAVE:
559   case AMDGPU::SI_SPILL_A1024_RESTORE:
560     return 32;
561   case AMDGPU::SI_SPILL_S512_SAVE:
562   case AMDGPU::SI_SPILL_S512_RESTORE:
563   case AMDGPU::SI_SPILL_V512_SAVE:
564   case AMDGPU::SI_SPILL_V512_RESTORE:
565   case AMDGPU::SI_SPILL_A512_SAVE:
566   case AMDGPU::SI_SPILL_A512_RESTORE:
567     return 16;
568   case AMDGPU::SI_SPILL_S256_SAVE:
569   case AMDGPU::SI_SPILL_S256_RESTORE:
570   case AMDGPU::SI_SPILL_V256_SAVE:
571   case AMDGPU::SI_SPILL_V256_RESTORE:
572   case AMDGPU::SI_SPILL_A256_SAVE:
573   case AMDGPU::SI_SPILL_A256_RESTORE:
574     return 8;
575   case AMDGPU::SI_SPILL_S192_SAVE:
576   case AMDGPU::SI_SPILL_S192_RESTORE:
577   case AMDGPU::SI_SPILL_V192_SAVE:
578   case AMDGPU::SI_SPILL_V192_RESTORE:
579   case AMDGPU::SI_SPILL_A192_SAVE:
580   case AMDGPU::SI_SPILL_A192_RESTORE:
581     return 6;
582   case AMDGPU::SI_SPILL_S160_SAVE:
583   case AMDGPU::SI_SPILL_S160_RESTORE:
584   case AMDGPU::SI_SPILL_V160_SAVE:
585   case AMDGPU::SI_SPILL_V160_RESTORE:
586   case AMDGPU::SI_SPILL_A160_SAVE:
587   case AMDGPU::SI_SPILL_A160_RESTORE:
588     return 5;
589   case AMDGPU::SI_SPILL_S128_SAVE:
590   case AMDGPU::SI_SPILL_S128_RESTORE:
591   case AMDGPU::SI_SPILL_V128_SAVE:
592   case AMDGPU::SI_SPILL_V128_RESTORE:
593   case AMDGPU::SI_SPILL_A128_SAVE:
594   case AMDGPU::SI_SPILL_A128_RESTORE:
595     return 4;
596   case AMDGPU::SI_SPILL_S96_SAVE:
597   case AMDGPU::SI_SPILL_S96_RESTORE:
598   case AMDGPU::SI_SPILL_V96_SAVE:
599   case AMDGPU::SI_SPILL_V96_RESTORE:
600   case AMDGPU::SI_SPILL_A96_SAVE:
601   case AMDGPU::SI_SPILL_A96_RESTORE:
602     return 3;
603   case AMDGPU::SI_SPILL_S64_SAVE:
604   case AMDGPU::SI_SPILL_S64_RESTORE:
605   case AMDGPU::SI_SPILL_V64_SAVE:
606   case AMDGPU::SI_SPILL_V64_RESTORE:
607   case AMDGPU::SI_SPILL_A64_SAVE:
608   case AMDGPU::SI_SPILL_A64_RESTORE:
609     return 2;
610   case AMDGPU::SI_SPILL_S32_SAVE:
611   case AMDGPU::SI_SPILL_S32_RESTORE:
612   case AMDGPU::SI_SPILL_V32_SAVE:
613   case AMDGPU::SI_SPILL_V32_RESTORE:
614   case AMDGPU::SI_SPILL_A32_SAVE:
615   case AMDGPU::SI_SPILL_A32_RESTORE:
616     return 1;
617   default: llvm_unreachable("Invalid spill opcode");
618   }
619 }
620 
621 static int getOffsetMUBUFStore(unsigned Opc) {
622   switch (Opc) {
623   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
624     return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
625   case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
626     return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
627   case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
628     return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
629   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
630     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
631   case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
632     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
633   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
634     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
635   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
636     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
637   default:
638     return -1;
639   }
640 }
641 
642 static int getOffsetMUBUFLoad(unsigned Opc) {
643   switch (Opc) {
644   case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
645     return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
646   case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
647     return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
648   case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
649     return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
650   case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
651     return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
652   case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
653     return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
654   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
655     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
656   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
657     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
658   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
659     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
660   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
661     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
662   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
663     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
664   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
665     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
666   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
667     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
668   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
669     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
670   default:
671     return -1;
672   }
673 }
674 
675 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
676                                            MachineBasicBlock::iterator MI,
677                                            int Index,
678                                            unsigned Lane,
679                                            unsigned ValueReg,
680                                            bool IsKill) {
681   MachineBasicBlock *MBB = MI->getParent();
682   MachineFunction *MF = MI->getParent()->getParent();
683   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
684   const SIInstrInfo *TII = ST.getInstrInfo();
685 
686   MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
687 
688   if (Reg == AMDGPU::NoRegister)
689     return MachineInstrBuilder();
690 
691   bool IsStore = MI->mayStore();
692   MachineRegisterInfo &MRI = MF->getRegInfo();
693   auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
694 
695   unsigned Dst = IsStore ? Reg : ValueReg;
696   unsigned Src = IsStore ? ValueReg : Reg;
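  // The accumulator copy direction follows from whether the spill slot
  // register is an AGPR or a VGPR: V_ACCVGPR_WRITE moves VGPR -> AGPR and
  // V_ACCVGPR_READ moves AGPR -> VGPR.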
697   unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
698                                                    : AMDGPU::V_ACCVGPR_READ_B32;
699 
700   auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
701                .addReg(Src, getKillRegState(IsKill));
702   MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
703   return MIB;
704 }
705 
706 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
707 // need to handle the case where an SGPR may need to be spilled while spilling.
708 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
709                                       MachineFrameInfo &MFI,
710                                       MachineBasicBlock::iterator MI,
711                                       int Index,
712                                       int64_t Offset) {
713   const SIInstrInfo *TII = ST.getInstrInfo();
714   MachineBasicBlock *MBB = MI->getParent();
715   const DebugLoc &DL = MI->getDebugLoc();
716   bool IsStore = MI->mayStore();
717 
718   unsigned Opc = MI->getOpcode();
719   int LoadStoreOp = IsStore ?
720     getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
721   if (LoadStoreOp == -1)
722     return false;
723 
724   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
725   if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
726     return true;
727 
728   MachineInstrBuilder NewMI =
729       BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
730           .add(*Reg)
731           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
732           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
733           .addImm(Offset)
734           .addImm(0) // glc
735           .addImm(0) // slc
736           .addImm(0) // tfe
737           .addImm(0) // dlc
738           .addImm(0) // swz
739           .cloneMemRefs(*MI);
740 
741   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
742                                                        AMDGPU::OpName::vdata_in);
743   if (VDataIn)
744     NewMI.add(*VDataIn);
745   return true;
746 }
747 
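// Expand a spill or reload of ValueReg into a series of 4-byte scratch
// accesses (MUBUF or flat scratch), one per 32-bit subregister. AGPR values
// are staged through a scavenged VGPR, and offsets that cannot be encoded
// directly are first materialized into an SGPR.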
748 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
749                                          unsigned LoadStoreOp,
750                                          int Index,
751                                          Register ValueReg,
752                                          bool IsKill,
753                                          MCRegister ScratchOffsetReg,
754                                          int64_t InstOffset,
755                                          MachineMemOperand *MMO,
756                                          RegScavenger *RS) const {
757   MachineBasicBlock *MBB = MI->getParent();
758   MachineFunction *MF = MI->getParent()->getParent();
759   const SIInstrInfo *TII = ST.getInstrInfo();
760   const MachineFrameInfo &MFI = MF->getFrameInfo();
761   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
762 
763   const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
764   const DebugLoc &DL = MI->getDebugLoc();
765   bool IsStore = Desc->mayStore();
766   bool IsFlat = TII->isFLATScratch(LoadStoreOp);
767 
768   bool Scavenged = false;
769   MCRegister SOffset = ScratchOffsetReg;
770 
771   const unsigned EltSize = 4;
772   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
773   unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
774   unsigned Size = NumSubRegs * EltSize;
775   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
776   int64_t MaxOffset = Offset + Size - EltSize;
777   int64_t ScratchOffsetRegDelta = 0;
778 
779   Align Alignment = MFI.getObjectAlign(Index);
780   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
781 
782   assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
783 
784   bool IsOffsetLegal = IsFlat
785       ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
786       : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
787   if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
788     SOffset = MCRegister();
789 
790     // We currently only support spilling VGPRs to EltSize boundaries, meaning
791     // we can simplify the adjustment of Offset here to just scale with
792     // WavefrontSize.
793     if (!IsFlat)
794       Offset *= ST.getWavefrontSize();
795 
796     // We don't have access to the register scavenger if this function is called
797     // during PEI::scavengeFrameVirtualRegs().
798     if (RS)
799       SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
800 
801     if (!SOffset) {
802       // There are no free SGPRs, and we are in the process of spilling
803       // VGPRs too.  Since we need a VGPR in order to spill SGPRs (this is true
804       // on SI/CI, and on VI it is true until we implement spilling using scalar
805       // stores), we have no way to free up an SGPR.  Our solution here is to
806       // add the offset directly to the ScratchOffset or StackPtrOffset
807       // register, and then subtract the offset after the spill to return the
808       // register to its original value.
809       if (!ScratchOffsetReg)
810         ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
811       SOffset = ScratchOffsetReg;
812       ScratchOffsetRegDelta = Offset;
813     } else {
814       Scavenged = true;
815     }
816 
817     if (!SOffset)
818       report_fatal_error("could not scavenge SGPR to spill in entry function");
819 
820     if (ScratchOffsetReg == AMDGPU::NoRegister) {
821       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
822           .addImm(Offset);
823     } else {
824       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
825           .addReg(ScratchOffsetReg)
826           .addImm(Offset);
827     }
828 
829     Offset = 0;
830   }
831 
832   if (IsFlat && SOffset == AMDGPU::NoRegister) {
833     assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
834            && "Unexpected vaddr for flat scratch with a FI operand");
835 
836     assert(ST.hasFlatScratchSTMode());
837     LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
838     Desc = &TII->get(LoadStoreOp);
839   }
840 
841   Register TmpReg;
842 
843   // FIXME: Flat scratch does not have to be limited to a dword per store.
844   for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
845     Register SubReg =
846         NumSubRegs == 1
847             ? ValueReg
848             : Register(getSubReg(ValueReg, getSubRegFromChannel(i)));
849 
850     unsigned SOffsetRegState = 0;
851     unsigned SrcDstRegState = getDefRegState(!IsStore);
852     if (i + 1 == e) {
853       SOffsetRegState |= getKillRegState(Scavenged);
854       // The last implicit use carries the "Kill" flag.
855       SrcDstRegState |= getKillRegState(IsKill);
856     }
857 
858     // Make sure the whole register is defined if there are undef components by
859     // adding an implicit def of the super-reg on the first instruction.
860     const bool NeedSuperRegDef = NumSubRegs > 1 && IsStore && i == 0;
861 
862     auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);
863 
864     if (!MIB.getInstr()) {
865       unsigned FinalReg = SubReg;
866 
867       const bool IsAGPR = hasAGPRs(RC);
868       if (IsAGPR) {
869         if (!TmpReg) {
870           assert(RS && "Needs to have RegScavenger to spill an AGPR!");
871           // FIXME: change to scavengeRegisterBackwards()
872           TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
873           RS->setRegUsed(TmpReg);
874         }
875         if (IsStore) {
876           auto AccRead = BuildMI(*MBB, MI, DL,
877                                  TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
878             .addReg(SubReg, getKillRegState(IsKill));
879           if (NeedSuperRegDef)
880             AccRead.addReg(ValueReg, RegState::ImplicitDefine);
881           AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
882         }
883         SubReg = TmpReg;
884       }
885 
886       MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
887       MachineMemOperand *NewMMO =
888           MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize,
889                                    commonAlignment(Alignment, EltSize * i));
890 
891       MIB = BuildMI(*MBB, MI, DL, *Desc)
892                 .addReg(SubReg,
893                         getDefRegState(!IsStore) | getKillRegState(IsKill));
894       if (!IsFlat)
895         MIB.addReg(FuncInfo->getScratchRSrcReg());
896 
897       if (SOffset == AMDGPU::NoRegister) {
898         if (!IsFlat)
899           MIB.addImm(0);
900       } else {
901         MIB.addReg(SOffset, SOffsetRegState);
902       }
903       MIB.addImm(Offset)
904           .addImm(0) // glc
905           .addImm(0) // slc
906           .addImm(0); // tfe for MUBUF or dlc for FLAT
907       if (!IsFlat)
908         MIB.addImm(0) // dlc
909            .addImm(0); // swz
910       MIB.addMemOperand(NewMMO);
911 
912       if (!IsAGPR && NeedSuperRegDef)
913         MIB.addReg(ValueReg, RegState::ImplicitDefine);
914 
915       if (!IsStore && TmpReg != AMDGPU::NoRegister) {
916         MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
917                       FinalReg)
918           .addReg(TmpReg, RegState::Kill);
919         MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
920       }
921     } else {
922       if (NeedSuperRegDef)
923         MIB.addReg(ValueReg, RegState::ImplicitDefine);
924     }
925 
926     if (NumSubRegs > 1) {
927       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
928     }
929   }
930 
931   if (ScratchOffsetRegDelta != 0) {
932     // Subtract the offset we added to the ScratchOffset register.
933     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
934         .addReg(SOffset)
935         .addImm(ScratchOffsetRegDelta);
936   }
937 }
938 
939 // Generate a VMEM access which loads or stores the VGPR containing an SGPR
940 // spill such that all the lanes set in VGPRLanes are loaded or stored.
941 // This generates exec mask manipulation and will use SGPRs available in MI
942 // or VGPR lanes in the VGPR to save and restore the exec mask.
943 void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
944                                              int Index, int Offset,
945                                              unsigned EltSize, Register VGPR,
946                                              int64_t VGPRLanes,
947                                              RegScavenger *RS,
948                                              bool IsLoad) const {
949   MachineBasicBlock *MBB = MI->getParent();
950   MachineFunction *MF = MBB->getParent();
951   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
952   const SIInstrInfo *TII = ST.getInstrInfo();
953 
954   Register SuperReg = MI->getOperand(0).getReg();
955   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
956   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
957   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
958   unsigned FirstPart = Offset * 32;
959   unsigned ExecLane = 0;
960 
961   bool IsKill = MI->getOperand(0).isKill();
962   const DebugLoc &DL = MI->getDebugLoc();
963 
964   // Cannot handle load/store to EXEC
965   assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
966          SuperReg != AMDGPU::EXEC && "exec should never spill");
967 
968   // On Wave32 only handle EXEC_LO.
969   // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
970   bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
971 
972   unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
973   Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
974   Register SavedExecReg;
975 
976   // Backup EXEC
977   if (OnlyExecLo) {
978     SavedExecReg =
979         NumSubRegs == 1
980             ? SuperReg
981             : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]));
982   } else {
983     // If src/dst is an odd size it is possible subreg0 is not aligned.
984     for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
985       SavedExecReg = getMatchingSuperReg(
986           getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
987           &AMDGPU::SReg_64_XEXECRegClass);
988       if (SavedExecReg)
989         break;
990     }
991   }
992   assert(SavedExecReg);
993   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
994 
995   // Setup EXEC
996   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
997 
998   // Load/store VGPR
999   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1000   assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1001 
1002   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
1003                           ? getBaseRegister()
1004                           : getFrameRegister(*MF);
1005 
1006   Align Alignment = FrameInfo.getObjectAlign(Index);
1007   MachinePointerInfo PtrInfo =
1008       MachinePointerInfo::getFixedStack(*MF, Index);
1009   MachineMemOperand *MMO = MF->getMachineMemOperand(
1010       PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1011       EltSize, Alignment);
1012 
1013   if (IsLoad) {
1014     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1015                                           : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1016     buildSpillLoadStore(MI, Opc,
1017           Index,
1018           VGPR, false,
1019           FrameReg,
1020           Offset * EltSize, MMO,
1021           RS);
1022   } else {
1023     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1024                                           : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1025     buildSpillLoadStore(MI, Opc, Index, VGPR,
1026                         IsKill, FrameReg,
1027                         Offset * EltSize, MMO, RS);
1028     // This only ever adds one VGPR spill
1029     MFI->addToSpilledVGPRs(1);
1030   }
1031 
1032   // Restore EXEC
1033   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
1034       .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
1035 
1036   // Restore clobbered SGPRs
1037   if (IsLoad) {
1038     // Nothing to do; register will be overwritten
1039   } else if (!IsKill) {
1040     // Restore SGPRs from appropriate VGPR lanes
1041     if (!OnlyExecLo) {
1042       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
1043               getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
1044           .addReg(VGPR)
1045           .addImm(ExecLane + 1);
1046     }
1047     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
1048             NumSubRegs == 1 ? SavedExecReg
1049                             : Register(getSubReg(
1050                                   SuperReg, SplitParts[FirstPart + ExecLane])))
1051         .addReg(VGPR, RegState::Kill)
1052         .addImm(ExecLane);
1053   }
1054 }
1055 
1056 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
1057                                int Index,
1058                                RegScavenger *RS,
1059                                bool OnlyToVGPR) const {
1060   MachineBasicBlock *MBB = MI->getParent();
1061   MachineFunction *MF = MBB->getParent();
1062   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1063   DenseSet<Register> SGPRSpillVGPRDefinedSet; // FIXME: This should be removed
1064 
1065   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
1066     = MFI->getSGPRToVGPRSpills(Index);
1067   bool SpillToVGPR = !VGPRSpills.empty();
1068   if (OnlyToVGPR && !SpillToVGPR)
1069     return false;
1070 
1071   const SIInstrInfo *TII = ST.getInstrInfo();
1072 
1073   Register SuperReg = MI->getOperand(0).getReg();
1074   bool IsKill = MI->getOperand(0).isKill();
1075   const DebugLoc &DL = MI->getDebugLoc();
1076 
1077   assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
1078                          SuperReg != MFI->getFrameOffsetReg()));
1079 
1080   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
1081   assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1082          SuperReg != AMDGPU::EXEC && "exec should never spill");
1083 
1084   unsigned EltSize = 4;
1085   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1086 
1087   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1088   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1089 
1090   if (SpillToVGPR) {
1091     for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1092       Register SubReg = NumSubRegs == 1
1093                             ? SuperReg
1094                             : Register(getSubReg(SuperReg, SplitParts[i]));
1095       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1096 
1097       bool UseKill = IsKill && i == NumSubRegs - 1;
1098 
1099       // During SGPR spilling to VGPR, determine if the VGPR is defined. The
1100       // only circumstance in which we say it is undefined is when it is the
1101       // first spill to this VGPR in the first basic block.
1102       bool VGPRDefined = true;
1103       if (MBB == &MF->front())
1104         VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
1105 
1106       // Mark the "old value of vgpr" input undef only if this is the first sgpr
1107       // spill to this specific vgpr in the first basic block.
1108       auto MIB =
1109           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
1110               .addReg(SubReg, getKillRegState(UseKill))
1111               .addImm(Spill.Lane)
1112               .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
1113 
1114       if (i == 0 && NumSubRegs > 1) {
1115         // We may be spilling a super-register which is only partially defined,
1116         // and need to ensure later spills think the value is defined.
1117         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1118       }
1119 
1120       if (NumSubRegs > 1)
1121         MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit);
1122 
1123       // FIXME: Since this spills to another register instead of an actual
1124       // frame index, we should delete the frame index when all references to
1125       // it are fixed.
1126     }
1127   } else {
1128     // Scavenged temporary VGPR to use. It must be scavenged once for any number
1129     // of spilled subregs.
1130     Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1131     RS->setRegUsed(TmpVGPR);
1132 
1133     // SubReg carries the "Kill" flag when SubReg == SuperReg.
1134     unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
1135 
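    // Each VGPR lane can hold one 32-bit SGPR value, so pack up to 32
    // subregisters per temporary VGPR; VGPRLanes is the mask of lanes that are
    // actually used.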
1136     unsigned PerVGPR = 32;
1137     unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
1138     int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
1139 
1140     for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
1141       unsigned TmpVGPRFlags = RegState::Undef;
1142 
1143       // Write sub registers into the VGPR
1144       for (unsigned i = Offset * PerVGPR,
1145                     e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
1146            i < e; ++i) {
1147         Register SubReg = NumSubRegs == 1
1148                               ? SuperReg
1149                               : Register(getSubReg(SuperReg, SplitParts[i]));
1150 
1151         MachineInstrBuilder WriteLane =
1152             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
1153                 .addReg(SubReg, SubKillState)
1154                 .addImm(i % PerVGPR)
1155                 .addReg(TmpVGPR, TmpVGPRFlags);
1156         TmpVGPRFlags = 0;
1157 
1158         // There could be undef components of a spilled super register.
1159         // TODO: Can we detect this and skip the spill?
1160         if (NumSubRegs > 1) {
1161           // The last implicit use of the SuperReg carries the "Kill" flag.
1162           unsigned SuperKillState = 0;
1163           if (i + 1 == NumSubRegs)
1164             SuperKillState |= getKillRegState(IsKill);
1165           WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
1166         }
1167       }
1168 
1169       // Write out VGPR
1170       buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
1171                               RS, false);
1172     }
1173   }
1174 
1175   MI->eraseFromParent();
1176   MFI->addToSpilledSGPRs(NumSubRegs);
1177   return true;
1178 }
1179 
1180 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
1181                                  int Index,
1182                                  RegScavenger *RS,
1183                                  bool OnlyToVGPR) const {
1184   MachineFunction *MF = MI->getParent()->getParent();
1185   MachineBasicBlock *MBB = MI->getParent();
1186   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1187 
1188   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
1189     = MFI->getSGPRToVGPRSpills(Index);
1190   bool SpillToVGPR = !VGPRSpills.empty();
1191   if (OnlyToVGPR && !SpillToVGPR)
1192     return false;
1193 
1194   const SIInstrInfo *TII = ST.getInstrInfo();
1195   const DebugLoc &DL = MI->getDebugLoc();
1196 
1197   Register SuperReg = MI->getOperand(0).getReg();
1198 
1199   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
1200   assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1201          SuperReg != AMDGPU::EXEC && "exec should never spill");
1202 
1203   unsigned EltSize = 4;
1204 
1205   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1206 
1207   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1208   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1209 
1210   if (SpillToVGPR) {
1211     for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1212       Register SubReg = NumSubRegs == 1
1213                             ? SuperReg
1214                             : Register(getSubReg(SuperReg, SplitParts[i]));
1215 
1216       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1217       auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
1218                      .addReg(Spill.VGPR)
1219                      .addImm(Spill.Lane);
1220       if (NumSubRegs > 1 && i == 0)
1221         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1222     }
1223   } else {
1224     Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1225     RS->setRegUsed(TmpVGPR);
1226 
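    // Mirror of the spill path: reload up to 32 lanes at a time from the
    // stack slot into the temporary VGPR, then unpack each lane back into its
    // SGPR subregister.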
1227     unsigned PerVGPR = 32;
1228     unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
1229     int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
1230 
1231     for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
1232       // Load in VGPR data
1233       buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
1234                               RS, true);
1235 
1236       // Unpack lanes
1237       for (unsigned i = Offset * PerVGPR,
1238                     e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
1239            i < e; ++i) {
1240         Register SubReg = NumSubRegs == 1
1241                               ? SuperReg
1242                               : Register(getSubReg(SuperReg, SplitParts[i]));
1243 
1244         bool LastSubReg = (i + 1 == e);
1245         auto MIB =
1246             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
1247                 .addReg(TmpVGPR, getKillRegState(LastSubReg))
1248                 .addImm(i);
1249         if (NumSubRegs > 1 && i == 0)
1250           MIB.addReg(SuperReg, RegState::ImplicitDefine);
1251       }
1252     }
1253   }
1254 
1255   MI->eraseFromParent();
1256   return true;
1257 }
1258 
1259 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
1260 /// a VGPR and the stack slot can be safely eliminated when all other users are
1261 /// handled.
1262 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
1263   MachineBasicBlock::iterator MI,
1264   int FI,
1265   RegScavenger *RS) const {
1266   switch (MI->getOpcode()) {
1267   case AMDGPU::SI_SPILL_S1024_SAVE:
1268   case AMDGPU::SI_SPILL_S512_SAVE:
1269   case AMDGPU::SI_SPILL_S256_SAVE:
1270   case AMDGPU::SI_SPILL_S192_SAVE:
1271   case AMDGPU::SI_SPILL_S160_SAVE:
1272   case AMDGPU::SI_SPILL_S128_SAVE:
1273   case AMDGPU::SI_SPILL_S96_SAVE:
1274   case AMDGPU::SI_SPILL_S64_SAVE:
1275   case AMDGPU::SI_SPILL_S32_SAVE:
1276     return spillSGPR(MI, FI, RS, true);
1277   case AMDGPU::SI_SPILL_S1024_RESTORE:
1278   case AMDGPU::SI_SPILL_S512_RESTORE:
1279   case AMDGPU::SI_SPILL_S256_RESTORE:
1280   case AMDGPU::SI_SPILL_S192_RESTORE:
1281   case AMDGPU::SI_SPILL_S160_RESTORE:
1282   case AMDGPU::SI_SPILL_S128_RESTORE:
1283   case AMDGPU::SI_SPILL_S96_RESTORE:
1284   case AMDGPU::SI_SPILL_S64_RESTORE:
1285   case AMDGPU::SI_SPILL_S32_RESTORE:
1286     return restoreSGPR(MI, FI, RS, true);
1287   default:
1288     llvm_unreachable("not an SGPR spill instruction");
1289   }
1290 }
1291 
1292 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
1293                                         int SPAdj, unsigned FIOperandNum,
1294                                         RegScavenger *RS) const {
1295   MachineFunction *MF = MI->getParent()->getParent();
1296   MachineBasicBlock *MBB = MI->getParent();
1297   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1298   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1299   const SIInstrInfo *TII = ST.getInstrInfo();
1300   DebugLoc DL = MI->getDebugLoc();
1301 
1302   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
1303 
1304   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
1305   int Index = MI->getOperand(FIOperandNum).getIndex();
1306 
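       // Fixed frame objects are addressed from the base pointer when one is in
       // use; everything else is addressed from the frame register.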
1307   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
1308                           ? getBaseRegister()
1309                           : getFrameRegister(*MF);
1310 
1311   switch (MI->getOpcode()) {
1312     // SGPR register spill
1313     case AMDGPU::SI_SPILL_S1024_SAVE:
1314     case AMDGPU::SI_SPILL_S512_SAVE:
1315     case AMDGPU::SI_SPILL_S256_SAVE:
1316     case AMDGPU::SI_SPILL_S192_SAVE:
1317     case AMDGPU::SI_SPILL_S160_SAVE:
1318     case AMDGPU::SI_SPILL_S128_SAVE:
1319     case AMDGPU::SI_SPILL_S96_SAVE:
1320     case AMDGPU::SI_SPILL_S64_SAVE:
1321     case AMDGPU::SI_SPILL_S32_SAVE: {
1322       spillSGPR(MI, Index, RS);
1323       break;
1324     }
1325 
1326     // SGPR register restore
1327     case AMDGPU::SI_SPILL_S1024_RESTORE:
1328     case AMDGPU::SI_SPILL_S512_RESTORE:
1329     case AMDGPU::SI_SPILL_S256_RESTORE:
1330     case AMDGPU::SI_SPILL_S192_RESTORE:
1331     case AMDGPU::SI_SPILL_S160_RESTORE:
1332     case AMDGPU::SI_SPILL_S128_RESTORE:
1333     case AMDGPU::SI_SPILL_S96_RESTORE:
1334     case AMDGPU::SI_SPILL_S64_RESTORE:
1335     case AMDGPU::SI_SPILL_S32_RESTORE: {
1336       restoreSGPR(MI, Index, RS);
1337       break;
1338     }
1339 
1340     // VGPR register spill
1341     case AMDGPU::SI_SPILL_V1024_SAVE:
1342     case AMDGPU::SI_SPILL_V512_SAVE:
1343     case AMDGPU::SI_SPILL_V256_SAVE:
1344     case AMDGPU::SI_SPILL_V160_SAVE:
1345     case AMDGPU::SI_SPILL_V128_SAVE:
1346     case AMDGPU::SI_SPILL_V96_SAVE:
1347     case AMDGPU::SI_SPILL_V64_SAVE:
1348     case AMDGPU::SI_SPILL_V32_SAVE:
1349     case AMDGPU::SI_SPILL_A1024_SAVE:
1350     case AMDGPU::SI_SPILL_A512_SAVE:
1351     case AMDGPU::SI_SPILL_A256_SAVE:
1352     case AMDGPU::SI_SPILL_A192_SAVE:
1353     case AMDGPU::SI_SPILL_A160_SAVE:
1354     case AMDGPU::SI_SPILL_A128_SAVE:
1355     case AMDGPU::SI_SPILL_A96_SAVE:
1356     case AMDGPU::SI_SPILL_A64_SAVE:
1357     case AMDGPU::SI_SPILL_A32_SAVE: {
1358       const MachineOperand *VData = TII->getNamedOperand(*MI,
1359                                                          AMDGPU::OpName::vdata);
1360       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1361              MFI->getStackPtrOffsetReg());
1362 
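           // Lower the spill pseudo via buildSpillLoadStore, using scratch stores
           // on flat-scratch targets and MUBUF stores otherwise.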
1363       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1364                                             : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1365       buildSpillLoadStore(MI, Opc,
1366             Index,
1367             VData->getReg(), VData->isKill(),
1368             FrameReg,
1369             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1370             *MI->memoperands_begin(),
1371             RS);
1372       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
1373       MI->eraseFromParent();
1374       break;
1375     }
1376     case AMDGPU::SI_SPILL_V32_RESTORE:
1377     case AMDGPU::SI_SPILL_V64_RESTORE:
1378     case AMDGPU::SI_SPILL_V96_RESTORE:
1379     case AMDGPU::SI_SPILL_V128_RESTORE:
1380     case AMDGPU::SI_SPILL_V160_RESTORE:
1381     case AMDGPU::SI_SPILL_V256_RESTORE:
1382     case AMDGPU::SI_SPILL_V512_RESTORE:
1383     case AMDGPU::SI_SPILL_V1024_RESTORE:
1384     case AMDGPU::SI_SPILL_A32_RESTORE:
1385     case AMDGPU::SI_SPILL_A64_RESTORE:
1386     case AMDGPU::SI_SPILL_A96_RESTORE:
1387     case AMDGPU::SI_SPILL_A128_RESTORE:
1388     case AMDGPU::SI_SPILL_A160_RESTORE:
1389     case AMDGPU::SI_SPILL_A192_RESTORE:
1390     case AMDGPU::SI_SPILL_A256_RESTORE:
1391     case AMDGPU::SI_SPILL_A512_RESTORE:
1392     case AMDGPU::SI_SPILL_A1024_RESTORE: {
1393       const MachineOperand *VData = TII->getNamedOperand(*MI,
1394                                                          AMDGPU::OpName::vdata);
1395       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1396              MFI->getStackPtrOffsetReg());
1397 
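           // The restore mirrors the spill expansion, using scratch or MUBUF loads.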
1398       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1399                                             : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1400       buildSpillLoadStore(MI, Opc,
1401             Index,
1402             VData->getReg(), VData->isKill(),
1403             FrameReg,
1404             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1405             *MI->memoperands_begin(),
1406             RS);
1407       MI->eraseFromParent();
1408       break;
1409     }
1410 
1411     default: {
1412       const DebugLoc &DL = MI->getDebugLoc();
1413 
1414       int64_t Offset = FrameInfo.getObjectOffset(Index);
1415       if (ST.enableFlatScratch()) {
1416         if (TII->isFLATScratch(*MI)) {
1417           // The offset is always swizzled; just replace it.
1418           if (FrameReg)
1419             FIOp.ChangeToRegister(FrameReg, false);
1420 
1421           if (!Offset)
1422             return;
1423 
1424           MachineOperand *OffsetOp =
1425             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1426           int64_t NewOffset = Offset + OffsetOp->getImm();
1427           if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1428                                      true)) {
1429             OffsetOp->setImm(NewOffset);
1430             if (FrameReg)
1431               return;
1432             Offset = 0;
1433           }
1434 
1435           assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
1436                  "Unexpected vaddr for flat scratch with a FI operand");
1437 
1438           // On GFX10 we have ST mode, which uses no registers for the
1439           // address. Otherwise we need to materialize 0 into an SGPR.
1440           if (!Offset && ST.hasFlatScratchSTMode()) {
1441             unsigned Opc = MI->getOpcode();
1442             unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
1443             MI->RemoveOperand(
1444                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
1445             MI->setDesc(TII->get(NewOpc));
1446             return;
1447           }
1448         }
1449 
1450         if (!FrameReg) {
1451           FIOp.ChangeToImmediate(Offset);
1452           if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
1453             return;
1454         }
1455 
1456         // We need to use a register here. Check whether we can use an SGPR
1457         // or whether we need a VGPR.
1458         FIOp.ChangeToRegister(AMDGPU::M0, false);
1459         bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
1460 
1461         if (!Offset && FrameReg && UseSGPR) {
1462           FIOp.setReg(FrameReg);
1463           return;
1464         }
1465 
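             // Otherwise materialize FrameReg + Offset into a scavenged register,
             // preferring an SGPR when the operand accepts one.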
1466         const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
1467                                                 : &AMDGPU::VGPR_32RegClass;
1468 
1469         Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
1470         FIOp.setReg(TmpReg);
1471         FIOp.setIsKill(true);
1472 
1473         if ((!FrameReg || !Offset) && TmpReg) {
1474           unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1475           auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
1476           if (FrameReg)
1477             MIB.addReg(FrameReg);
1478           else
1479             MIB.addImm(Offset);
1480 
1481           return;
1482         }
1483 
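             // Even with a VGPR operand we still need an SGPR for the add below;
             // if none can be scavenged, reuse FrameReg and undo the change after.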
1484         Register TmpSReg =
1485             UseSGPR ? TmpReg
1486                     : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
1487                                            !UseSGPR);
1488 
1489         // TODO: for flat scratch another attempt can be made with a VGPR index
1490         //       if no SGPRs can be scavenged.
1491         if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
1492           report_fatal_error("Cannot scavenge register in FI elimination!");
1493 
1494         if (!TmpSReg) {
1495           // Use frame register and restore it after.
1496           TmpSReg = FrameReg;
1497           FIOp.setReg(FrameReg);
1498           FIOp.setIsKill(false);
1499         }
1500 
1501         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
1502           .addReg(FrameReg)
1503           .addImm(Offset);
1504 
1505         if (!UseSGPR)
1506           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1507             .addReg(TmpSReg, RegState::Kill);
1508 
1509         if (TmpSReg == FrameReg) {
1510           // Undo frame register modification.
1511           BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32),
1512                   FrameReg)
1513             .addReg(FrameReg)
1514             .addImm(Offset);
1515         }
1516 
1517         return;
1518       }
1519 
1520       bool IsMUBUF = TII->isMUBUF(*MI);
1521 
1522       if (!IsMUBUF && !MFI->isEntryFunction()) {
1523         // Convert to a swizzled stack address by scaling by the wave size.
1524         //
1525         // In an entry function/kernel the offset is already swizzled.
1526 
1527         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
1528         Register ResultReg =
1529             IsCopy ? MI->getOperand(0).getReg()
1530                    : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1531 
1532         int64_t Offset = FrameInfo.getObjectOffset(Index);
1533         if (Offset == 0) {
1534           // XXX - This never happens because of emergency scavenging slot at 0?
1535           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
1536             .addImm(ST.getWavefrontSizeLog2())
1537             .addReg(FrameReg);
1538         } else {
1539           if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
1540             // Reuse ResultReg in intermediate step.
1541             Register ScaledReg = ResultReg;
1542 
1543             BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
1544                     ScaledReg)
1545               .addImm(ST.getWavefrontSizeLog2())
1546               .addReg(FrameReg);
1547 
1548             const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
1549 
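                 // A VOP2 add can encode the offset as a literal; only
                 // inline-immediate offsets are folded into the VOP3 carry-out
                 // form here, otherwise the offset is first materialized into the
                 // otherwise-unused carry-out SGPR.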
1550             // TODO: Fold if use instruction is another add of a constant.
1551             if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
1552               // FIXME: This can fail
1553               MIB.addImm(Offset);
1554               MIB.addReg(ScaledReg, RegState::Kill);
1555               if (!IsVOP2)
1556                 MIB.addImm(0); // clamp bit
1557             } else {
1558               assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
1559                      "Need to reuse carry out register");
1560 
1561               // Use scavenged unused carry out as offset register.
1562               Register ConstOffsetReg;
1563               if (!isWave32)
1564                 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
1565               else
1566                 ConstOffsetReg = MIB.getReg(1);
1567 
1568               BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
1569                 .addImm(Offset);
1570               MIB.addReg(ConstOffsetReg, RegState::Kill);
1571               MIB.addReg(ScaledReg, RegState::Kill);
1572               MIB.addImm(0); // clamp bit
1573             }
1574           } else {
1575             // We have to produce a carry out, and there isn't a free SGPR pair
1576             // for it. We can keep the whole computation on the SALU to avoid
1577             // clobbering an additional register at the cost of an extra mov.
1578 
1579             // We may have 1 free scratch SGPR even though a carry out is
1580             // unavailable. Only one additional mov is needed.
1581             Register TmpScaledReg =
1582                 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
1583             Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
1584 
1585             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
1586               .addReg(FrameReg)
1587               .addImm(ST.getWavefrontSizeLog2());
1588             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
1589               .addReg(ScaledReg, RegState::Kill)
1590               .addImm(Offset);
1591             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
1592               .addReg(ScaledReg, RegState::Kill);
1593 
1594             // If there were truly no free SGPRs, we need to undo everything.
1595             if (!TmpScaledReg.isValid()) {
1596               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
1597                 .addReg(ScaledReg, RegState::Kill)
1598                 .addImm(Offset);
1599               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
1600                 .addReg(FrameReg)
1601                 .addImm(ST.getWavefrontSizeLog2());
1602             }
1603           }
1604         }
1605 
1606         // Don't introduce an extra copy if we're just materializing in a mov.
1607         if (IsCopy)
1608           MI->eraseFromParent();
1609         else
1610           FIOp.ChangeToRegister(ResultReg, false, false, true);
1611         return;
1612       }
1613 
1614       if (IsMUBUF) {
1615         // Disable offen so we don't need a 0 vgpr base.
1616         assert(static_cast<int>(FIOperandNum) ==
1617                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1618                                           AMDGPU::OpName::vaddr));
1619 
1620         auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
1621         assert((SOffset.isReg() &&
1622                 SOffset.getReg() == MFI->getStackPtrOffsetReg()) ||
1623                (SOffset.isImm() && SOffset.getImm() == 0));
1624         if (SOffset.isReg()) {
1625           if (FrameReg == AMDGPU::NoRegister) {
1626             SOffset.ChangeToImmediate(0);
1627           } else {
1628             SOffset.setReg(FrameReg);
1629           }
1630         } else if (SOffset.isImm() && FrameReg != AMDGPU::NoRegister) {
1631           SOffset.ChangeToRegister(FrameReg, false);
1632         }
1633 
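             // Try to fold the frame object's offset into the MUBUF immediate
             // offset field if the combined value is still legal.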
1634         int64_t Offset = FrameInfo.getObjectOffset(Index);
1635         int64_t OldImm
1636           = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
1637         int64_t NewOffset = OldImm + Offset;
1638 
1639         if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
1640             buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
1641           MI->eraseFromParent();
1642           return;
1643         }
1644       }
1645 
1646       // If the offset is simply too big, don't convert to a scratch wave offset
1647       // relative index.
1648 
1649       FIOp.ChangeToImmediate(Offset);
1650       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
1651         Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1652         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1653           .addImm(Offset);
1654         FIOp.ChangeToRegister(TmpReg, false, false, true);
1655       }
1656     }
1657   }
1658 }
1659 
1660 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
1661   return AMDGPUInstPrinter::getRegisterName(Reg);
1662 }
1663 
1664 const TargetRegisterClass *
1665 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
1666   if (BitWidth == 1)
1667     return &AMDGPU::VReg_1RegClass;
1668   if (BitWidth <= 16)
1669     return &AMDGPU::VGPR_LO16RegClass;
1670   if (BitWidth <= 32)
1671     return &AMDGPU::VGPR_32RegClass;
1672   if (BitWidth <= 64)
1673     return &AMDGPU::VReg_64RegClass;
1674   if (BitWidth <= 96)
1675     return &AMDGPU::VReg_96RegClass;
1676   if (BitWidth <= 128)
1677     return &AMDGPU::VReg_128RegClass;
1678   if (BitWidth <= 160)
1679     return &AMDGPU::VReg_160RegClass;
1680   if (BitWidth <= 192)
1681     return &AMDGPU::VReg_192RegClass;
1682   if (BitWidth <= 256)
1683     return &AMDGPU::VReg_256RegClass;
1684   if (BitWidth <= 512)
1685     return &AMDGPU::VReg_512RegClass;
1686   if (BitWidth <= 1024)
1687     return &AMDGPU::VReg_1024RegClass;
1688 
1689   return nullptr;
1690 }
1691 
1692 const TargetRegisterClass *
1693 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
1694   if (BitWidth <= 16)
1695     return &AMDGPU::AGPR_LO16RegClass;
1696   if (BitWidth <= 32)
1697     return &AMDGPU::AGPR_32RegClass;
1698   if (BitWidth <= 64)
1699     return &AMDGPU::AReg_64RegClass;
1700   if (BitWidth <= 96)
1701     return &AMDGPU::AReg_96RegClass;
1702   if (BitWidth <= 128)
1703     return &AMDGPU::AReg_128RegClass;
1704   if (BitWidth <= 160)
1705     return &AMDGPU::AReg_160RegClass;
1706   if (BitWidth <= 192)
1707     return &AMDGPU::AReg_192RegClass;
1708   if (BitWidth <= 256)
1709     return &AMDGPU::AReg_256RegClass;
1710   if (BitWidth <= 512)
1711     return &AMDGPU::AReg_512RegClass;
1712   if (BitWidth <= 1024)
1713     return &AMDGPU::AReg_1024RegClass;
1714 
1715   return nullptr;
1716 }
1717 
1718 const TargetRegisterClass *
1719 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
1720   if (BitWidth <= 16)
1721     return &AMDGPU::SGPR_LO16RegClass;
1722   if (BitWidth <= 32)
1723     return &AMDGPU::SReg_32RegClass;
1724   if (BitWidth <= 64)
1725     return &AMDGPU::SReg_64RegClass;
1726   if (BitWidth <= 96)
1727     return &AMDGPU::SGPR_96RegClass;
1728   if (BitWidth <= 128)
1729     return &AMDGPU::SGPR_128RegClass;
1730   if (BitWidth <= 160)
1731     return &AMDGPU::SGPR_160RegClass;
1732   if (BitWidth <= 192)
1733     return &AMDGPU::SGPR_192RegClass;
1734   if (BitWidth <= 256)
1735     return &AMDGPU::SGPR_256RegClass;
1736   if (BitWidth <= 512)
1737     return &AMDGPU::SGPR_512RegClass;
1738   if (BitWidth <= 1024)
1739     return &AMDGPU::SGPR_1024RegClass;
1740 
1741   return nullptr;
1742 }
1743 
1744 // FIXME: This is very slow. It might be worth creating a map from physreg to
1745 // register class.
1746 const TargetRegisterClass *
1747 SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
1748   static const TargetRegisterClass *const BaseClasses[] = {
1749     &AMDGPU::VGPR_LO16RegClass,
1750     &AMDGPU::VGPR_HI16RegClass,
1751     &AMDGPU::SReg_LO16RegClass,
1752     &AMDGPU::AGPR_LO16RegClass,
1753     &AMDGPU::VGPR_32RegClass,
1754     &AMDGPU::SReg_32RegClass,
1755     &AMDGPU::AGPR_32RegClass,
1756     &AMDGPU::VReg_64RegClass,
1757     &AMDGPU::SReg_64RegClass,
1758     &AMDGPU::AReg_64RegClass,
1759     &AMDGPU::VReg_96RegClass,
1760     &AMDGPU::SReg_96RegClass,
1761     &AMDGPU::AReg_96RegClass,
1762     &AMDGPU::VReg_128RegClass,
1763     &AMDGPU::SReg_128RegClass,
1764     &AMDGPU::AReg_128RegClass,
1765     &AMDGPU::VReg_160RegClass,
1766     &AMDGPU::SReg_160RegClass,
1767     &AMDGPU::AReg_160RegClass,
1768     &AMDGPU::VReg_192RegClass,
1769     &AMDGPU::SReg_192RegClass,
1770     &AMDGPU::AReg_192RegClass,
1771     &AMDGPU::VReg_256RegClass,
1772     &AMDGPU::SReg_256RegClass,
1773     &AMDGPU::AReg_256RegClass,
1774     &AMDGPU::VReg_512RegClass,
1775     &AMDGPU::SReg_512RegClass,
1776     &AMDGPU::AReg_512RegClass,
1777     &AMDGPU::SReg_1024RegClass,
1778     &AMDGPU::VReg_1024RegClass,
1779     &AMDGPU::AReg_1024RegClass,
1780     &AMDGPU::SCC_CLASSRegClass,
1781     &AMDGPU::Pseudo_SReg_32RegClass,
1782     &AMDGPU::Pseudo_SReg_128RegClass,
1783   };
1784 
1785   for (const TargetRegisterClass *BaseClass : BaseClasses) {
1786     if (BaseClass->contains(Reg)) {
1787       return BaseClass;
1788     }
1789   }
1790   return nullptr;
1791 }
1792 
1793 // TODO: It might be helpful to have some target specific flags in
1794 // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
1795 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
1796   unsigned Size = getRegSizeInBits(*RC);
1797   if (Size == 16) {
1798     return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
1799            getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
1800   }
1801   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
1802   if (!VRC) {
1803     assert(Size < 32 && "Invalid register class size");
1804     return false;
1805   }
1806   return getCommonSubClass(VRC, RC) != nullptr;
1807 }
1808 
1809 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
1810   unsigned Size = getRegSizeInBits(*RC);
1811   if (Size < 16)
1812     return false;
1813   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
1814   if (!ARC) {
1815     assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
1816     return false;
1817   }
1818   return getCommonSubClass(ARC, RC) != nullptr;
1819 }
1820 
1821 const TargetRegisterClass *
1822 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
1823   unsigned Size = getRegSizeInBits(*SRC);
1824   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
1825   assert(VRC && "Invalid register class size");
1826   return VRC;
1827 }
1828 
1829 const TargetRegisterClass *
1830 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
1831   unsigned Size = getRegSizeInBits(*SRC);
1832   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
1833   assert(ARC && "Invalid register class size");
1834   return ARC;
1835 }
1836 
1837 const TargetRegisterClass *
1838 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
1839   unsigned Size = getRegSizeInBits(*VRC);
1840   if (Size == 32)
1841     return &AMDGPU::SGPR_32RegClass;
1842   const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
1843   assert(SRC && "Invalid register class size");
1844   return SRC;
1845 }
1846 
1847 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
1848                          const TargetRegisterClass *RC, unsigned SubIdx) const {
1849   if (SubIdx == AMDGPU::NoSubRegister)
1850     return RC;
1851 
1852   // We can assume that each lane corresponds to one 32-bit register.
1853   unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
1854   if (isSGPRClass(RC)) {
1855     if (Size == 32)
1856       RC = &AMDGPU::SGPR_32RegClass;
1857     else
1858       RC = getSGPRClassForBitWidth(Size);
1859   } else if (hasAGPRs(RC)) {
1860     RC = getAGPRClassForBitWidth(Size);
1861   } else {
1862     RC = getVGPRClassForBitWidth(Size);
1863   }
1864   assert(RC && "Invalid sub-register class size");
1865   return RC;
1866 }
1867 
1868 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
1869   if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
1870       OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
1871     return !ST.hasMFMAInlineLiteralBug();
1872 
1873   return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
1874          OpType <= AMDGPU::OPERAND_SRC_LAST;
1875 }
1876 
1877 bool SIRegisterInfo::shouldRewriteCopySrc(
1878   const TargetRegisterClass *DefRC,
1879   unsigned DefSubReg,
1880   const TargetRegisterClass *SrcRC,
1881   unsigned SrcSubReg) const {
1882   // We want to prefer the smallest register class possible, so we don't want to
1883   // stop and rewrite on anything that looks like a subregister
1884   // extract. Operations mostly don't care about the super register class, so we
1885   // only want to stop on the most basic of copies between the same register
1886   // class.
1887   //
1888   // e.g. if we have something like
1889   // %0 = ...
1890   // %1 = ...
1891   // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
1892   // %3 = COPY %2, sub0
1893   //
1894   // We want to look through the COPY to find:
1895   //  => %3 = COPY %0
1896 
1897   // Plain copy.
1898   return getCommonSubClass(DefRC, SrcRC) != nullptr;
1899 }
1900 
1901 /// Returns the lowest register that is not used at any point in the
1902 /// function. If all registers are used, this function returns
1903 /// AMDGPU::NoRegister. If \p ReserveHighestVGPR is true, it returns the
1904 /// highest unused register instead.
1905 MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
1906                                               const TargetRegisterClass *RC,
1907                                               const MachineFunction &MF,
1908                                               bool ReserveHighestVGPR) const {
1909   if (ReserveHighestVGPR) {
1910     for (MCRegister Reg : reverse(*RC))
1911       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
1912         return Reg;
1913   } else {
1914     for (MCRegister Reg : *RC)
1915       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
1916         return Reg;
1917   }
1918   return MCRegister();
1919 }
1920 
1921 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
1922                                                    unsigned EltSize) const {
1923   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
1924   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
1925 
1926   const unsigned RegDWORDs = RegBitWidth / 32;
1927   const unsigned EltDWORDs = EltSize / 4;
1928   assert(RegSplitParts.size() + 1 >= EltDWORDs);
1929 
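         // RegSplitParts[N - 1] lists the sub-register indices that split a
         // register into N-dword pieces; only the first RegDWORDs / EltDWORDs
         // entries apply to this register class.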
1930   const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
1931   const unsigned NumParts = RegDWORDs / EltDWORDs;
1932 
1933   return makeArrayRef(Parts.data(), NumParts);
1934 }
1935 
1936 const TargetRegisterClass*
1937 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
1938                                   Register Reg) const {
1939   return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
1940 }
1941 
1942 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
1943                             Register Reg) const {
1944   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
1945   // Registers without classes are unaddressable, SGPR-like registers.
1946   return RC && hasVGPRs(RC);
1947 }
1948 
1949 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
1950                             Register Reg) const {
1951   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
1952 
1953   // Registers without classes are unaddressable, SGPR-like registers.
1954   return RC && hasAGPRs(RC);
1955 }
1956 
1957 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
1958                                     const TargetRegisterClass *SrcRC,
1959                                     unsigned SubReg,
1960                                     const TargetRegisterClass *DstRC,
1961                                     unsigned DstSubReg,
1962                                     const TargetRegisterClass *NewRC,
1963                                     LiveIntervals &LIS) const {
1964   unsigned SrcSize = getRegSizeInBits(*SrcRC);
1965   unsigned DstSize = getRegSizeInBits(*DstRC);
1966   unsigned NewSize = getRegSizeInBits(*NewRC);
1967 
1968   // Do not increase the size of registers beyond a dword; we would need to
1969   // allocate adjacent registers and constrain regalloc more than needed.
1970 
1971   // Always allow dword coalescing.
1972   if (SrcSize <= 32 || DstSize <= 32)
1973     return true;
1974 
1975   return NewSize <= DstSize || NewSize <= SrcSize;
1976 }
1977 
1978 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
1979                                              MachineFunction &MF) const {
1980   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1981 
1982   unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
1983                                                        MF.getFunction());
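       // Cap each pressure limit by both the occupancy-derived bound and the
       // function's maximum register budget.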
1984   switch (RC->getID()) {
1985   default:
1986     return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
1987   case AMDGPU::VGPR_32RegClassID:
1988   case AMDGPU::VGPR_LO16RegClassID:
1989   case AMDGPU::VGPR_HI16RegClassID:
1990     return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
1991   case AMDGPU::SGPR_32RegClassID:
1992   case AMDGPU::SGPR_LO16RegClassID:
1993     return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
1994   }
1995 }
1996 
1997 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
1998                                                 unsigned Idx) const {
1999   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
2000       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
2001     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
2002                                const_cast<MachineFunction &>(MF));
2003 
2004   if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
2005     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
2006                                const_cast<MachineFunction &>(MF));
2007 
2008   llvm_unreachable("Unexpected register pressure set!");
2009 }
2010 
2011 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
2012   static const int Empty[] = { -1 };
2013 
2014   if (RegPressureIgnoredUnits[RegUnit])
2015     return Empty;
2016 
2017   return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
2018 }
2019 
2020 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
2021   // Not a callee-saved register.
2022   return AMDGPU::SGPR30_SGPR31;
2023 }
2024 
2025 const TargetRegisterClass *
2026 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
2027                                          const RegisterBank &RB,
2028                                          const MachineRegisterInfo &MRI) const {
2029   switch (RB.getID()) {
2030   case AMDGPU::VGPRRegBankID:
2031     return getVGPRClassForBitWidth(std::max(32u, Size));
2032   case AMDGPU::VCCRegBankID:
2033     assert(Size == 1);
2034     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
2035                     : &AMDGPU::SReg_64_XEXECRegClass;
2036   case AMDGPU::SGPRRegBankID:
2037     return getSGPRClassForBitWidth(std::max(32u, Size));
2038   case AMDGPU::AGPRRegBankID:
2039     return getAGPRClassForBitWidth(std::max(32u, Size));
2040   default:
2041     llvm_unreachable("unknown register bank");
2042   }
2043 }
2044 
2045 const TargetRegisterClass *
2046 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
2047                                          const MachineRegisterInfo &MRI) const {
2048   const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
2049   if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
2050     return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
2051 
2052   const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
2053   return getAllocatableClass(RC);
2054 }
2055 
2056 MCRegister SIRegisterInfo::getVCC() const {
2057   return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
2058 }
2059 
2060 const TargetRegisterClass *
2061 SIRegisterInfo::getRegClass(unsigned RCID) const {
2062   switch ((int)RCID) {
2063   case AMDGPU::SReg_1RegClassID:
2064     return getBoolRC();
2065   case AMDGPU::SReg_1_XEXECRegClassID:
2066     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
2067       : &AMDGPU::SReg_64_XEXECRegClass;
2068   case -1:
2069     return nullptr;
2070   default:
2071     return AMDGPUGenRegisterInfo::getRegClass(RCID);
2072   }
2073 }
2074 
2075 // Find the register definition that reaches the given use.
2076 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
2077                                               MachineInstr &Use,
2078                                               MachineRegisterInfo &MRI,
2079                                               LiveIntervals *LIS) const {
2080   auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
2081   SlotIndex UseIdx = LIS->getInstructionIndex(Use);
2082   SlotIndex DefIdx;
2083 
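       // For virtual registers, find the value number live at the use, looking at
       // the subrange covering the requested lanes if the interval has subranges.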
2084   if (Reg.isVirtual()) {
2085     if (!LIS->hasInterval(Reg))
2086       return nullptr;
2087     LiveInterval &LI = LIS->getInterval(Reg);
2088     LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
2089                                   : MRI.getMaxLaneMaskForVReg(Reg);
2090     VNInfo *V = nullptr;
2091     if (LI.hasSubRanges()) {
2092       for (auto &S : LI.subranges()) {
2093         if ((S.LaneMask & SubLanes) == SubLanes) {
2094           V = S.getVNInfoAt(UseIdx);
2095           break;
2096         }
2097       }
2098     } else {
2099       V = LI.getVNInfoAt(UseIdx);
2100     }
2101     if (!V)
2102       return nullptr;
2103     DefIdx = V->def;
2104   } else {
2105     // Find last def.
2106     for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
2107          ++Units) {
2108       LiveRange &LR = LIS->getRegUnit(*Units);
2109       if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
2110         if (!DefIdx.isValid() ||
2111             MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
2112                           LIS->getInstructionFromIndex(V->def)))
2113           DefIdx = V->def;
2114       } else {
2115         return nullptr;
2116       }
2117     }
2118   }
2119 
2120   MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
2121 
2122   if (!Def || !MDT.dominates(Def, &Use))
2123     return nullptr;
2124 
2125   assert(Def->modifiesRegister(Reg, this));
2126 
2127   return Def;
2128 }
2129 
2130 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
2131   assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);
2132 
2133   for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
2134                                          AMDGPU::SReg_32RegClass,
2135                                          AMDGPU::AGPR_32RegClass } ) {
2136     if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
2137       return Super;
2138   }
2139   if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
2140                                             &AMDGPU::VGPR_32RegClass)) {
2141       return Super;
2142   }
2143 
2144   return AMDGPU::NoRegister;
2145 }
2146 
2147 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
2148   switch (PhysReg) {
2149   case AMDGPU::SGPR_NULL:
2150   case AMDGPU::SRC_SHARED_BASE:
2151   case AMDGPU::SRC_PRIVATE_BASE:
2152   case AMDGPU::SRC_SHARED_LIMIT:
2153   case AMDGPU::SRC_PRIVATE_LIMIT:
2154     return true;
2155   default:
2156     return false;
2157   }
2158 }
2159 
2160 ArrayRef<MCPhysReg>
2161 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
2162   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
2163                       ST.getMaxNumSGPRs(MF) / 4);
2164 }
2165 
2166 ArrayRef<MCPhysReg>
2167 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
2168   return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
2169                       ST.getMaxNumSGPRs(MF) / 2);
2170 }
2171 
2172 ArrayRef<MCPhysReg>
2173 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
2174   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
2175 }
2176