//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"

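// Defining GET_TARGET_REGBANK_IMPL before the include pulls the
// TableGen-generated register bank implementation in from
// AMDGPUGenRegisterBank.inc.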
#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  const RegisterBank *NewBank;
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

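  // Banks are applied in the destructor rather than in createdInstr() because
  // a new instruction has no operands at the point it is observed; by the
  // time the observer is destroyed, the LegalizerHelper has filled them in.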
  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};

}
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static bool AlreadyInit = false;
  if (AlreadyInit)
    return;

  AlreadyInit = true;

  assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
         &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
         &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
}

static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      isVectorRegisterBank(Src)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  if (Src.getID() == AMDGPU::VCCRegBankID)
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
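  // The hardware has no direct AGPR-to-AGPR move, so such a copy is expected
  // to bounce through an intermediate VGPR, hence the higher cost.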
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied-to type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // dst, value, lane, old value
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load: {
    static const OpRegBankEntry<3> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

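// The "amdgpu.noclobber" metadata marks a load whose memory is known not to
// have been written between the start of the kernel and the load itself;
// isScalarLoadLegal below uses it as one of the conditions that make a
// scalar (SMEM) load safe outside the constant address spaces.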
static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
  return I && I->getMetadata("amdgpu.noclobber");
}

// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;

  // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
  return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
    // Can't do a scalar atomic load.
    !MMO->isAtomic() &&
    // Don't use scalar loads for volatile accesses to non-constant address
    // spaces.
    (IsConst || !MMO->isVolatile()) &&
    // Memory must be known constant, or not written before this load.
    (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
    AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(
      3, 3, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SVMapping);

    // SGPR in LHS is slightly preferable, so make VS more expensive than SV.
    const InstructionMapping &VSMapping = getInstructionMapping(
      3, 4, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VSMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();
    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());

    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;
  }
  case TargetOpcode::G_ICMP: {
    // TODO: Should report 32-bit for scalar output type.
    unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SVMapping);

    const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          nullptr, // Predicate operand.
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX: {
    static const OpRegBankEntry<3> Table[4] = {
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Scalar requires cmp+select, and extends if 16-bit.
      // FIXME: Should there be separate costs for 32 and 16-bit
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

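/// Split the 64-bit value \p Reg into two 32-bit halves with
/// G_UNMERGE_VALUES, assigning both halves the register bank of the source
/// and appending them to \p Regs.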
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in that values are compared across lanes so
/// that each unique combination of values is only executed once.
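///
/// Each iteration uses V_READFIRSTLANE_B32 to read the operand value from the
/// first still-active lane, then compares that value against the operand in
/// every active lane; all lanes that match are handled together in that
/// iteration and then removed from the exec mask.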
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

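  // Resulting control flow: MBB falls through to LoopBB, which branches back
  // to itself until every unique value has been handled, then falls through
  // to RestoreExecBB, which rejoins the original code in RemainderBB.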
  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  // Figure out the iterator range after splicing the instructions.
  auto NewBegin = std::prev(LoopBB->end());

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      if (SGPROperandRegs.count(Op.getReg())) {
        LLT OpTy = MRI.getType(Op.getReg());
        unsigned OpSize = OpTy.getSizeInBits();

        // Can only do a readlane of 32-bit pieces.
        if (OpSize == 32) {
          // Avoid extra copies in the simple case of one 32-bit register.
          Register CurrentLaneOpReg
            = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
          MRI.setType(CurrentLaneOpReg, OpTy);

          constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
          // Read the next variant <- also loop target.
          BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                  CurrentLaneOpReg)
            .addReg(Op.getReg());

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          // Compare the just read M0 value to all possible Idx values.
          B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(Op.getReg());
          Op.setReg(CurrentLaneOpReg);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        } else {
          LLT S32 = LLT::scalar(32);
          SmallVector<Register, 8> ReadlanePieces;

          // The compares can be done as 64-bit, but the extract needs to be done
          // in 32-bit pieces.

          bool Is64 = OpSize % 64 == 0;

          LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
          unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
            : AMDGPU::V_CMP_EQ_U32_e64;

          // Insert the unmerge before the loop.
          B.setMBB(MBB);
          auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
          B.setInstr(*I);

          unsigned NumPieces = Unmerge->getNumOperands() - 1;
          for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
            Register UnmergePiece = Unmerge.getReg(PieceIdx);

            Register CurrentLaneOpReg;
            if (Is64) {
              Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
              Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

              MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
              MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
              MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

              // Read the next variant <- also loop target.
              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                      CurrentLaneOpRegLo)
                .addReg(UnmergePiece, 0, AMDGPU::sub0);

              // Read the next variant <- also loop target.
              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                      CurrentLaneOpRegHi)
                .addReg(UnmergePiece, 0, AMDGPU::sub1);

              CurrentLaneOpReg =
                B.buildMerge(LLT::scalar(64),
                             {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
                .getReg(0);

              MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

              if (OpTy.getScalarSizeInBits() == 64) {
                // If we need to produce a 64-bit element vector, use the
                // merged pieces.
                ReadlanePieces.push_back(CurrentLaneOpReg);
              } else {
                // 32-bit element type.
                ReadlanePieces.push_back(CurrentLaneOpRegLo);
                ReadlanePieces.push_back(CurrentLaneOpRegHi);
              }
            } else {
              CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
              MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
              MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

              // Read the next variant <- also loop target.
              BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                      CurrentLaneOpReg)
                .addReg(UnmergePiece);
              ReadlanePieces.push_back(CurrentLaneOpReg);
            }

            Register NewCondReg = MRI.createVirtualRegister(WaveRC);
            bool First = CondReg == AMDGPU::NoRegister;
            if (First)
              CondReg = NewCondReg;

            B.buildInstr(CmpOp)
              .addDef(NewCondReg)
              .addReg(CurrentLaneOpReg)
              .addReg(UnmergePiece);

            if (!First) {
              Register AndReg = MRI.createVirtualRegister(WaveRC);

              // If there are multiple operands to consider, AND the conditions
              // together.
              B.buildInstr(WaveAndOpc)
                .addDef(AndReg)
                .addReg(NewCondReg)
                .addReg(CondReg);
              CondReg = AndReg;
            }
          }

          // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
          // BUILD_VECTOR
          if (OpTy.isVector()) {
            auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
            Op.setReg(Merge.getReg(0));
          } else {
            auto Merge = B.buildMerge(OpTy, ReadlanePieces);
            Op.setReg(Merge.getReg(0));
          }

          MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
        }
      }
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() == AMDGPU::VGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // If no operands need to be replaced, there is no need to loop.
  return !SGPROperandRegs.empty();
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  MRI.setType(SGPR, MRI.getType(Reg));

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register.  Then it calls applyMapping and expects
// that the targets will either delete or rewrite the original instruction that
// wrote to the repaired registers.  Because of this, we end up in a situation
// where we have 2 instructions defining the same registers.
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  // Is there some way we can assert that there are exactly 2 def instructions?
  for (MachineInstr &Other : MRI.def_instructions(Reg)) {
    if (&Other != &MI)
      return &Other;
  }

  return nullptr;
}

bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                              MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty()) {
    const RegisterBank *PtrBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    if (PtrBank == &AMDGPU::SGPRRegBank)
      return false;
    SrcRegs.push_back(MI.getOperand(1).getReg());
  }

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads.  At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg as well.

  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}

bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  RsrcIdx += NumDefs + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
    if (!MI.getOperand(I).isReg())
      continue;

    // If this intrinsic has a sampler, it immediately follows rsrc.
    if (I == RsrcIdx || I == RsrcIdx + 1)
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}

// FIXME: Duplicated from LegalizerHelper
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_SMIN:
    return CmpInst::ICMP_SLT;
  case TargetOpcode::G_SMAX:
    return CmpInst::ICMP_SGT;
  case TargetOpcode::G_UMIN:
    return CmpInst::ICMP_ULT;
  case TargetOpcode::G_UMAX:
    return CmpInst::ICMP_UGT;
  default:
    llvm_unreachable("not in integer min/max");
  }
}

// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
                                               MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
  LLT CmpType = LLT::scalar(32);

  auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
  B.buildSelect(Dst, Cmp, Src0, Src1);

  B.getMRI()->setRegBank(Cmp.getReg(0), AMDGPU::SGPRRegBank);
  MI.eraseFromParent();
}

// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand.
static void substituteSimpleCopyRegs(
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
  if (!SrcReg.empty()) {
    assert(SrcReg.size() == 1);
    OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
  }
}

/// Handle register layout difference for f16 images for some subtargets.
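/// On subtargets with unpacked D16 VMEM, each 16-bit element of the store
/// data occupies the low half of its own 32-bit register, so a vector of s16
/// elements is widened to one 32-bit register per element before the store.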
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(Unmerge.getReg(I));

  const LLT S32 = LLT::scalar(32);
  int NumElts = StoreVT.getNumElements();

  return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Const;
  if (mi_match(Reg, MRI, m_ICst(Const)))
    return std::make_pair(Register(), Const);

  Register Base;
  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
    return std::make_pair(Base, Const);

  // TODO: Handle G_OR used for add case
  return std::make_pair(Reg, 0);
}

1296 std::pair<Register, unsigned>
1297 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1298                                            Register OrigOffset) const {
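  // The MUBUF addressing mode only has a 12-bit unsigned immediate offset
  // field, so any offset bits above 4095 must live in the voffset register.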
1299   const unsigned MaxImm = 4095;
1300   Register BaseReg;
1301   unsigned ImmOffset;
1302   const LLT S32 = LLT::scalar(32);
1303 
1304   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1305                                                            OrigOffset);
1306 
1307   unsigned C1 = 0;
1308   if (ImmOffset != 0) {
1309     // If the immediate value is too big for the immoffset field, keep only the
1310     // low 12 bits in the immoffset field, so that the value that is copied or
1311     // added for the voffset field is a multiple of 4096, and it stands more
1312     // chance of being CSEd with the copy/add for another similar load/store.
1313     // However, do not do that rounding down to a multiple of 4096 if that is a
1314     // negative number, as it appears to be illegal to have a negative offset
1315     // in the vgpr, even if adding the immediate offset makes it positive.
1316     unsigned Overflow = ImmOffset & ~MaxImm;
1317     ImmOffset -= Overflow;
1318     if ((int32_t)Overflow < 0) {
1319       Overflow += ImmOffset;
1320       ImmOffset = 0;
1321     }
1322 
1323     C1 = ImmOffset;
1324     if (Overflow != 0) {
1325       if (!BaseReg)
1326         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1327       else {
1328         auto OverflowVal = B.buildConstant(S32, Overflow);
1329         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1330       }
1331     }
1332   }
1333 
1334   if (!BaseReg)
1335     BaseReg = B.buildConstant(S32, 0).getReg(0);
1336 
1337   return {BaseReg, C1};
1338 }
1339 
1340 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1341   int64_t C;
1342   return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1343 }
1344 
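// Buffer intrinsics pack their cache policy operand as a bitfield: glc
// (globally coherent) in bit 0, slc (system level coherent) in bit 1, and
// dlc (device level coherent) in bit 2.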
1345 static unsigned extractGLC(unsigned CachePolicy) {
1346   return CachePolicy & 1;
1347 }
1348 
1349 static unsigned extractSLC(unsigned CachePolicy) {
1350   return (CachePolicy >> 1) & 1;
1351 }
1352 
1353 static unsigned extractDLC(unsigned CachePolicy) {
1354   return (CachePolicy >> 2) & 1;
1355 }
1356 
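// Select a MUBUF store for a buffer store intrinsic. The rsrc (operand 2) and
// soffset (operand 4) inputs must be uniform, so divergent values are handled
// by executing the store inside a waterfall loop.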
1357 MachineInstr *
1358 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1359                                              MachineInstr &MI) const {
1360   MachineRegisterInfo &MRI = *B.getMRI();
1361   executeInWaterfallLoop(B, MI, MRI, {2, 4});
1362 
1363   // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1364 
1365   Register VData = MI.getOperand(1).getReg();
1366   LLT Ty = MRI.getType(VData);
1367 
1368   int EltSize = Ty.getScalarSizeInBits();
1369   int Size = Ty.getSizeInBits();
1370 
1371   // FIXME: Broken integer truncstore.
1372   if (EltSize != 32)
1373     report_fatal_error("unhandled intrinsic store");
1374 
1375   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1376   const int MemSize = (*MI.memoperands_begin())->getSize();
1377 
1378 
1379   Register RSrc = MI.getOperand(2).getReg();
1380   Register VOffset = MI.getOperand(3).getReg();
1381   Register SOffset = MI.getOperand(4).getReg();
1382   unsigned CachePolicy = MI.getOperand(5).getImm();
1383 
1384   unsigned ImmOffset;
1385   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1386 
1387   const bool Offen = !isZero(VOffset, MRI);
1388 
1389   unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1390   switch (8 * MemSize) {
1391   case 8:
1392     Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1393                   AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1394     break;
1395   case 16:
1396     Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1397                   AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1398     break;
1399   default:
1400     Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1401                   AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1402     if (Size > 32)
1403       Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1404     break;
1405   }
1406 
1407 
1408   // Set the insertion point back to the instruction in case it was moved into a
1409   // loop.
1410   B.setInstr(MI);
1411 
1412   MachineInstrBuilder MIB = B.buildInstr(Opc)
1413     .addUse(VData);
1414 
1415   if (Offen)
1416     MIB.addUse(VOffset);
1417 
1418   MIB.addUse(RSrc)
1419      .addUse(SOffset)
1420      .addImm(ImmOffset)
1421      .addImm(extractGLC(CachePolicy))
1422      .addImm(extractSLC(CachePolicy))
1423      .addImm(0) // tfe: FIXME: Remove from inst
1424      .addImm(extractDLC(CachePolicy))
1425      .cloneMemRefs(MI);
1426 
1427   // FIXME: We need a way to report failure from applyMappingImpl.
1428   // Insert the constraining copies before inserting the loop.
1429   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1430     report_fatal_error("failed to constrain selected store intrinsic");
1431 
1432   return MIB;
1433 }
1434 
1435 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1436                                         Register SrcReg) const {
1437   MachineRegisterInfo &MRI = *B.getMRI();
1438   LLT SrcTy = MRI.getType(SrcReg);
1439   if (SrcTy.getSizeInBits() == 32) {
1440     // Use a v_mov_b32 here to make the exec dependency explicit.
1441     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1442       .addDef(DstReg)
1443       .addUse(SrcReg);
1444     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1445            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1446   }
1447 
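  // There is no single 64-bit VALU move, so copy the two 32-bit halves
  // separately and recombine them with a REG_SEQUENCE.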
1448   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1449   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1450 
1451   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1452     .addDef(TmpReg0)
1453     .addUse(SrcReg, 0, AMDGPU::sub0);
1454   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1455     .addDef(TmpReg1)
1456     .addUse(SrcReg, 0, AMDGPU::sub1);
1457   B.buildInstr(AMDGPU::REG_SEQUENCE)
1458     .addDef(DstReg)
1459     .addUse(TmpReg0)
1460     .addImm(AMDGPU::sub0)
1461     .addUse(TmpReg1)
1462     .addImm(AMDGPU::sub1);
1463 
1464   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1465          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1466 }
1467 
1468 void AMDGPURegisterBankInfo::applyMappingImpl(
1469     const OperandsMapper &OpdMapper) const {
1470   MachineInstr &MI = OpdMapper.getMI();
1471   unsigned Opc = MI.getOpcode();
1472   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1473   switch (Opc) {
1474   case AMDGPU::G_PHI: {
1475     Register DstReg = MI.getOperand(0).getReg();
1476     LLT DstTy = MRI.getType(DstReg);
1477     if (DstTy != LLT::scalar(1))
1478       break;
1479 
1480     const LLT S32 = LLT::scalar(32);
1481     const RegisterBank *DstBank =
1482       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1483     if (DstBank == &AMDGPU::VCCRegBank) {
1484       applyDefaultMapping(OpdMapper);
1485       // The standard handling only considers the result register bank for
1486       // phis. For VCC, blindly inserting a copy when the phi is lowered will
1487       // produce an invalid copy. We can only copy with some kind of compare to
1488       // get a vector boolean result. Insert a register bank copy that will be
1489       // correctly lowered to a compare.
1490       MachineIRBuilder B(*MI.getParent()->getParent());
1491 
1492       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1493         Register SrcReg = MI.getOperand(I).getReg();
1494         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1495 
1496         if (SrcBank != &AMDGPU::VCCRegBank) {
1497           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
1498           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
1499 
1500           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
1501           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
1502           MI.getOperand(I).setReg(Copy.getReg(0));
1503         }
1504       }
1505 
1506       return;
1507     }
1508 
1509     // Phi handling is strange and only considers the bank of the destination.
1510     substituteSimpleCopyRegs(OpdMapper, 0);
1511 
1512     // Promote SGPR/VGPR booleans to s32
1513     MachineFunction *MF = MI.getParent()->getParent();
1514     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1515     GISelObserverWrapper Observer(&ApplyBank);
1516     MachineIRBuilder B(MI);
1517     LegalizerHelper Helper(*MF, Observer, B);
1518 
1519     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1520       llvm_unreachable("widen scalar should have succeeded");
1521 
1522     return;
1523   }
1524   case AMDGPU::G_ICMP:
1525   case AMDGPU::G_UADDO:
1526   case AMDGPU::G_USUBO:
1527   case AMDGPU::G_UADDE:
1528   case AMDGPU::G_SADDE:
1529   case AMDGPU::G_USUBE:
1530   case AMDGPU::G_SSUBE: {
1531     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
1532     Register DstReg = MI.getOperand(BoolDstOp).getReg();
1533 
1534     const RegisterBank *DstBank =
1535       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1536     if (DstBank != &AMDGPU::SGPRRegBank)
1537       break;
1538 
1539     const bool HasCarryIn = MI.getNumOperands() == 5;
1540 
1541     // If this is a scalar compare, promote the result to s32, as the selection
1542     // will end up using a copy to a 32-bit vreg.
1543     const LLT S32 = LLT::scalar(32);
1544     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
1545     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
1546     MI.getOperand(BoolDstOp).setReg(NewDstReg);
1547     MachineIRBuilder B(MI);
1548 
1549     if (HasCarryIn) {
1550       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
1551       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
1552       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
1553       MI.getOperand(4).setReg(NewSrcReg);
1554     }
1555 
1556     MachineBasicBlock *MBB = MI.getParent();
1557     B.setInsertPt(*MBB, std::next(MI.getIterator()));
1558     B.buildTrunc(DstReg, NewDstReg);
1559     return;
1560   }
1561   case AMDGPU::G_SELECT: {
1562     Register DstReg = MI.getOperand(0).getReg();
1563     LLT DstTy = MRI.getType(DstReg);
1564 
1565     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
1566     if (CondRegs.empty())
1567       CondRegs.push_back(MI.getOperand(1).getReg());
1568     else {
1569       assert(CondRegs.size() == 1);
1570     }
1571 
1572     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
1573     if (CondBank == &AMDGPU::SGPRRegBank) {
1574       MachineIRBuilder B(MI);
1575       const LLT S32 = LLT::scalar(32);
1576       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
1577       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
1578 
1579       MI.getOperand(1).setReg(NewCondReg);
1580       B.buildZExt(NewCondReg, CondRegs[0]);
1581     }
1582 
1583     if (DstTy.getSizeInBits() != 64)
1584       break;
1585 
1586     MachineIRBuilder B(MI);
1587     LLT HalfTy = getHalfSizedType(DstTy);
1588 
1589     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1590     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1591     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
1592 
1593     // All inputs are SGPRs, nothing special to do.
1594     if (DefRegs.empty()) {
1595       assert(Src1Regs.empty() && Src2Regs.empty());
1596       break;
1597     }
1598 
1599     if (Src1Regs.empty())
1600       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1601     else {
1602       setRegsToType(MRI, Src1Regs, HalfTy);
1603     }
1604 
1605     if (Src2Regs.empty())
1606       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
1607     else
1608       setRegsToType(MRI, Src2Regs, HalfTy);
1609 
1610     setRegsToType(MRI, DefRegs, HalfTy);
1611 
1612     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
1613     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
1614 
1615     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1616     MI.eraseFromParent();
1617     return;
1618   }
1619   case AMDGPU::G_BRCOND: {
1620     Register CondReg = MI.getOperand(0).getReg();
1621     // FIXME: Should use legalizer helper, but should change bool ext type.
1622     const RegisterBank *CondBank =
1623       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1624 
1625     if (CondBank == &AMDGPU::SGPRRegBank) {
1626       MachineIRBuilder B(MI);
1627       const LLT S32 = LLT::scalar(32);
1628       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
1629       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
1630 
1631       MI.getOperand(0).setReg(NewCondReg);
1632       B.buildZExt(NewCondReg, CondReg);
1633       return;
1634     }
1635 
1636     break;
1637   }
1638   case AMDGPU::G_AND:
1639   case AMDGPU::G_OR:
1640   case AMDGPU::G_XOR: {
1641     // A 64-bit G_AND/G_OR/G_XOR is only available on the SALU, so split into
1642     // 2 32-bit ops if there is a VGPR input.
1643     Register DstReg = MI.getOperand(0).getReg();
1644     LLT DstTy = MRI.getType(DstReg);
1645 
1646     if (DstTy.getSizeInBits() == 1) {
1647       const RegisterBank *DstBank =
1648         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1649       if (DstBank == &AMDGPU::VCCRegBank)
1650         break;
1651 
1652       MachineFunction *MF = MI.getParent()->getParent();
1653       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1654       GISelObserverWrapper Observer(&ApplyBank);
1655       MachineIRBuilder B(MI);
1656       LegalizerHelper Helper(*MF, Observer, B);
1657 
1658       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1659           LegalizerHelper::Legalized)
1660         llvm_unreachable("widen scalar should have succeeded");
1661       return;
1662     }
1663 
1664     if (DstTy.getSizeInBits() != 64)
1665       break;
1666 
1667     LLT HalfTy = getHalfSizedType(DstTy);
1668     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1669     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
1670     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1671 
1672     // All inputs are SGPRs, nothing special to do.
1673     if (DefRegs.empty()) {
1674       assert(Src0Regs.empty() && Src1Regs.empty());
1675       break;
1676     }
1677 
1678     assert(DefRegs.size() == 2);
1679     assert(Src0Regs.size() == Src1Regs.size() &&
1680            (Src0Regs.empty() || Src0Regs.size() == 2));
1681 
1682     // Depending on where the source registers came from, the generic code may
1683     // have decided to split the inputs already or not. If not, we still need to
1684     // extract the values.
1685     MachineIRBuilder B(MI);
1686 
1687     if (Src0Regs.empty())
1688       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
1689     else
1690       setRegsToType(MRI, Src0Regs, HalfTy);
1691 
1692     if (Src1Regs.empty())
1693       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1694     else
1695       setRegsToType(MRI, Src1Regs, HalfTy);
1696 
1697     setRegsToType(MRI, DefRegs, HalfTy);
1698 
1699     B.buildInstr(Opc)
1700       .addDef(DefRegs[0])
1701       .addUse(Src0Regs[0])
1702       .addUse(Src1Regs[0]);
1703 
1704     B.buildInstr(Opc)
1705       .addDef(DefRegs[1])
1706       .addUse(Src0Regs[1])
1707       .addUse(Src1Regs[1]);
1708 
1709     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1710     MI.eraseFromParent();
1711     return;
1712   }
1713   case AMDGPU::G_ADD:
1714   case AMDGPU::G_SUB:
1715   case AMDGPU::G_MUL: {
1716     Register DstReg = MI.getOperand(0).getReg();
1717     LLT DstTy = MRI.getType(DstReg);
1718     if (DstTy != LLT::scalar(16))
1719       break;
1720 
1721     const RegisterBank *DstBank =
1722       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1723     if (DstBank == &AMDGPU::VGPRRegBank)
1724       break;
1725 
1726     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
1727     MachineFunction *MF = MI.getParent()->getParent();
1728     MachineIRBuilder B(MI);
1729     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
1730     GISelObserverWrapper Observer(&ApplySALU);
1731     LegalizerHelper Helper(*MF, Observer, B);
1732 
1733     if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1734         LegalizerHelper::Legalized)
1735       llvm_unreachable("widen scalar should have succeeded");
1736     return;
1737   }
1738   case AMDGPU::G_SMIN:
1739   case AMDGPU::G_SMAX:
1740   case AMDGPU::G_UMIN:
1741   case AMDGPU::G_UMAX: {
1742     Register DstReg = MI.getOperand(0).getReg();
1743     const RegisterBank *DstBank =
1744       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1745     if (DstBank == &AMDGPU::VGPRRegBank)
1746       break;
1747 
1748     MachineFunction *MF = MI.getParent()->getParent();
1749     MachineIRBuilder B(MI);
1750 
1751     // Turn scalar min/max into a compare and select.
1752     LLT Ty = MRI.getType(DstReg);
1753     LLT S32 = LLT::scalar(32);
1754     LLT S16 = LLT::scalar(16);
1755 
1756     if (Ty == S16) {
1757       ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
1758       GISelObserverWrapper Observer(&ApplySALU);
1759       LegalizerHelper Helper(*MF, Observer, B);
1760 
1761       // Need to widen to s32, and expand as cmp + select.
1762       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1763         llvm_unreachable("widenScalar should have succeeded");
1764 
1765       // FIXME: This is relying on widenScalar leaving MI in place.
1766       lowerScalarMinMax(B, MI);
1767     } else
1768       lowerScalarMinMax(B, MI);
1769 
1770     return;
1771   }
1772   case AMDGPU::G_SEXT:
1773   case AMDGPU::G_ZEXT: {
1774     Register SrcReg = MI.getOperand(1).getReg();
1775     LLT SrcTy = MRI.getType(SrcReg);
1776     bool Signed = Opc == AMDGPU::G_SEXT;
1777 
1778     MachineIRBuilder B(MI);
1779     const RegisterBank *SrcBank =
1780       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1781 
1782     Register DstReg = MI.getOperand(0).getReg();
1783     LLT DstTy = MRI.getType(DstReg);
1784     if (DstTy.isScalar() &&
1785         SrcBank != &AMDGPU::SGPRRegBank &&
1786         SrcBank != &AMDGPU::VCCRegBank &&
1787         // FIXME: Should handle any type that round to s64 when irregular
1788         // breakdowns supported.
1789         DstTy.getSizeInBits() == 64 &&
1790         SrcTy.getSizeInBits() <= 32) {
1791       const LLT S32 = LLT::scalar(32);
1792       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1793 
1794       // Extend to 32-bit, and then extend the low half.
1795       if (Signed) {
1796         // TODO: Should really be buildSExtOrCopy
1797         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
1798 
1799         // Replicate sign bit from 32-bit extended part.
1800         auto ShiftAmt = B.buildConstant(S32, 31);
1801         MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1802         B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
1803       } else {
1804         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
1805         B.buildConstant(DefRegs[1], 0);
1806       }
1807 
1808       MRI.setRegBank(DstReg, *SrcBank);
1809       MI.eraseFromParent();
1810       return;
1811     }
1812 
1813     if (SrcTy != LLT::scalar(1))
1814       return;
1815 
1816     if (SrcBank == &AMDGPU::VCCRegBank) {
1817       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1818 
1819       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
1820 
1821       unsigned DstSize = DstTy.getSizeInBits();
1822       // 64-bit select is SGPR only
1823       const bool UseSel64 = DstSize > 32 &&
1824         SrcBank->getID() == AMDGPU::SGPRRegBankID;
1825 
1826       // TODO: Should s16 select be legal?
1827       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
1828       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
1829       auto False = B.buildConstant(SelType, 0);
1830 
1831       MRI.setRegBank(True.getReg(0), *DstBank);
1832       MRI.setRegBank(False.getReg(0), *DstBank);
1833       MRI.setRegBank(DstReg, *DstBank);
1834 
1835       if (DstSize > 32) {
1836         B.buildSelect(DefRegs[0], SrcReg, True, False);
1837         B.buildCopy(DefRegs[1], DefRegs[0]);
1838       } else if (DstSize < 32) {
1839         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
1840         MRI.setRegBank(Sel.getReg(0), *DstBank);
1841         B.buildTrunc(DstReg, Sel);
1842       } else {
1843         B.buildSelect(DstReg, SrcReg, True, False);
1844       }
1845 
1846       MI.eraseFromParent();
1847       return;
1848     }
1849 
1850     // Fix up the case with an s1 src that isn't a condition register. Use shifts
1851     // instead of introducing a compare to avoid an unnecessary condition
1852     // register (and since there are no scalar 16-bit compares).
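    // For example, an s1 -> s16 G_SEXT becomes an anyext followed by a shift
    // left and an arithmetic shift right by 15, replicating the low bit into
    // the rest of the result.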
1853     auto Ext = B.buildAnyExt(DstTy, SrcReg);
1854     auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
1855     auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
1856 
1857     if (MI.getOpcode() == AMDGPU::G_SEXT)
1858       B.buildAShr(DstReg, Shl, ShiftAmt);
1859     else
1860       B.buildLShr(DstReg, Shl, ShiftAmt);
1861 
1862     MRI.setRegBank(DstReg, *SrcBank);
1863     MRI.setRegBank(Ext.getReg(0), *SrcBank);
1864     MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1865     MRI.setRegBank(Shl.getReg(0), *SrcBank);
1866     MI.eraseFromParent();
1867     return;
1868   }
1869   case AMDGPU::G_BUILD_VECTOR:
1870   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
1871     Register DstReg = MI.getOperand(0).getReg();
1872     LLT DstTy = MRI.getType(DstReg);
1873     if (DstTy != LLT::vector(2, 16))
1874       break;
1875 
1876     assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
1877     substituteSimpleCopyRegs(OpdMapper, 1);
1878     substituteSimpleCopyRegs(OpdMapper, 2);
1879 
1880     const RegisterBank *DstBank =
1881       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1882     if (DstBank == &AMDGPU::SGPRRegBank)
1883       break; // Can use S_PACK_* instructions.
1884 
1885     MachineIRBuilder B(MI);
1886 
1887     Register Lo = MI.getOperand(1).getReg();
1888     Register Hi = MI.getOperand(2).getReg();
1889     const LLT S32 = LLT::scalar(32);
1890 
1891     const RegisterBank *BankLo =
1892       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1893     const RegisterBank *BankHi =
1894       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1895 
1896     Register ZextLo;
1897     Register ShiftHi;
1898 
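    // Pack the halves as (lo | (hi << 16)) in a 32-bit register and bitcast
    // the result to <2 x s16>. G_BUILD_VECTOR zero-extends both halves, while
    // G_BUILD_VECTOR_TRUNC masks lo to 16 bits and shifts hi up unmodified.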
1899     if (Opc == AMDGPU::G_BUILD_VECTOR) {
1900       ZextLo = B.buildZExt(S32, Lo).getReg(0);
1901       MRI.setRegBank(ZextLo, *BankLo);
1902 
1903       Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
1904       MRI.setRegBank(ZextHi, *BankHi);
1905 
1906       auto ShiftAmt = B.buildConstant(S32, 16);
1907       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1908 
1909       ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
1910       MRI.setRegBank(ShiftHi, *BankHi);
1911     } else {
1912       Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
1913       MRI.setRegBank(MaskLo, *BankLo);
1914 
1915       auto ShiftAmt = B.buildConstant(S32, 16);
1916       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1917 
1918       ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
1919       MRI.setRegBank(ShiftHi, *BankHi);
1920 
1921       ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
1922       MRI.setRegBank(ZextLo, *BankLo);
1923     }
1924 
1925     auto Or = B.buildOr(S32, ZextLo, ShiftHi);
1926     MRI.setRegBank(Or.getReg(0), *DstBank);
1927 
1928     B.buildBitcast(DstReg, Or);
1929     MI.eraseFromParent();
1930     return;
1931   }
1932   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
1933     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1934 
1935     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
1936 
1937     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1938     MachineIRBuilder B(MI);
1939 
1940     const ValueMapping &DstMapping
1941       = OpdMapper.getInstrMapping().getOperandMapping(0);
1942     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
1943     const RegisterBank *SrcBank =
1944       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1945 
1946     Register DstReg = MI.getOperand(0).getReg();
1947     Register SrcReg = MI.getOperand(1).getReg();
1948     Register IdxReg = MI.getOperand(2).getReg();
1949 
1950     // If this is a VGPR result only because the index was a VGPR result, the
1951     // actual indexing will be done on the SGPR source vector, which will
1952     // produce a scalar result. We need to copy to the VGPR result inside the
1953     // waterfall loop.
1954     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
1955                                 SrcBank == &AMDGPU::SGPRRegBank;
1956     if (DstRegs.empty()) {
1957       applyDefaultMapping(OpdMapper);
1958 
1959       executeInWaterfallLoop(MI, MRI, { 2 });
1960 
1961       if (NeedCopyToVGPR) {
1962         // We don't want a phi for this temporary reg.
1963         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
1964         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
1965         MI.getOperand(0).setReg(TmpReg);
1966         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
1967 
1968         // Use a v_mov_b32 here to make the exec dependency explicit.
1969         buildVCopy(B, DstReg, TmpReg);
1970       }
1971 
1972       return;
1973     }
1974 
1975     assert(DstTy.getSizeInBits() == 64);
1976 
1977     LLT SrcTy = MRI.getType(SrcReg);
1978     const LLT S32 = LLT::scalar(32);
1979     LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
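    // A dynamic extract of a 64-bit element is done as two 32-bit extracts on
    // the bitcast vector: element I of <2 x s64> becomes elements 2*I and
    // 2*I+1 of <4 x s32>.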
1980 
1981     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
1982     auto One = B.buildConstant(S32, 1);
1983 
1984     // Split the vector index into 32-bit pieces. Prepare to move all of the
1985     // new instructions into a waterfall loop if necessary.
1986     //
1987     // Don't put the bitcast or constant in the loop.
1988     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
1989 
1990     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
1991     auto IdxLo = B.buildShl(S32, IdxReg, One);
1992     auto IdxHi = B.buildAdd(S32, IdxLo, One);
1993 
1994     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
1995     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
1996 
1997     MRI.setRegBank(DstReg, *DstBank);
1998     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
1999     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2000     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2001     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2002 
2003     SmallSet<Register, 4> OpsToWaterfall;
2004     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2005       MI.eraseFromParent();
2006       return;
2007     }
2008 
2009     // Remove the original instruction to avoid potentially confusing the
2010     // waterfall loop logic.
2011     B.setInstr(*Span.begin());
2012     MI.eraseFromParent();
2013     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2014                            OpsToWaterfall, MRI);
2015 
2016     if (NeedCopyToVGPR) {
2017       MachineBasicBlock *LoopBB = Extract1->getParent();
2018       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2019       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2020       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2021       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2022 
2023       Extract0->getOperand(0).setReg(TmpReg0);
2024       Extract1->getOperand(0).setReg(TmpReg1);
2025 
2026       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2027 
2028       buildVCopy(B, DstRegs[0], TmpReg0);
2029       buildVCopy(B, DstRegs[1], TmpReg1);
2030     }
2031 
2032     return;
2033   }
2034   case AMDGPU::G_INSERT_VECTOR_ELT: {
2035     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2036 
2037     assert(OpdMapper.getVRegs(0).empty());
2038     assert(OpdMapper.getVRegs(1).empty());
2039     assert(OpdMapper.getVRegs(3).empty());
2040 
2041     if (InsRegs.empty()) {
2042       applyDefaultMapping(OpdMapper);
2043       executeInWaterfallLoop(MI, MRI, { 3 });
2044       return;
2045     }
2046 
2047     Register DstReg = MI.getOperand(0).getReg();
2048     Register SrcReg = MI.getOperand(1).getReg();
2049     Register InsReg = MI.getOperand(2).getReg();
2050     Register IdxReg = MI.getOperand(3).getReg();
2051     LLT SrcTy = MRI.getType(SrcReg);
2052     LLT InsTy = MRI.getType(InsReg);
2053     (void)InsTy;
2054 
2055     assert(InsTy.getSizeInBits() == 64);
2056 
2057     const LLT S32 = LLT::scalar(32);
2058     LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
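    // As in the extract case, a 64-bit element insert is performed as two
    // chained 32-bit inserts on the bitcast <2*N x s32> vector.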
2059 
2060     MachineIRBuilder B(MI);
2061     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2062     auto One = B.buildConstant(S32, 1);
2063 
2064     // Split the vector index into 32-bit pieces. Prepare to move all of the
2065     // new instructions into a waterfall loop if necessary.
2066     //
2067     // Don't put the bitcast or constant in the loop.
2068     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2069 
2070     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2071     auto IdxLo = B.buildShl(S32, IdxReg, One);
2072     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2073 
2074     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2075     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2076     B.buildBitcast(DstReg, InsHi);
2077 
2078     const RegisterBank *DstBank =
2079       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2080     const RegisterBank *SrcBank =
2081       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2082     const RegisterBank *InsSrcBank =
2083       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2084 
2085     MRI.setRegBank(InsReg, *InsSrcBank);
2086     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2087     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2088     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2089     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2090     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2091     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2092 
2093 
2094     SmallSet<Register, 4> OpsToWaterfall;
2095     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2096       MI.eraseFromParent();
2097       return;
2098     }
2099 
2100     B.setInstr(*Span.begin());
2101     MI.eraseFromParent();
2102 
2103     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2104                            OpsToWaterfall, MRI);
2105     return;
2106   }
2107   case AMDGPU::G_INTRINSIC: {
2108     switch (MI.getIntrinsicID()) {
2109     case Intrinsic::amdgcn_s_buffer_load: {
2110       // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
2111       executeInWaterfallLoop(MI, MRI, { 2, 3 });
2112       return;
2113     }
2114     case Intrinsic::amdgcn_readlane: {
2115       substituteSimpleCopyRegs(OpdMapper, 2);
2116 
2117       assert(OpdMapper.getVRegs(0).empty());
2118       assert(OpdMapper.getVRegs(3).empty());
2119 
2120       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2121       // waterfall loop, so assume it's a uniform value.
2122       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2123       return;
2124     }
2125     case Intrinsic::amdgcn_writelane: {
2126       assert(OpdMapper.getVRegs(0).empty());
2127       assert(OpdMapper.getVRegs(2).empty());
2128       assert(OpdMapper.getVRegs(3).empty());
2129 
2130       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2131       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2132       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2133       return;
2134     }
2135     default:
2136       break;
2137     }
2138     break;
2139   }
2140   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2141     auto IntrID = MI.getIntrinsicID();
2142     switch (IntrID) {
2143     case Intrinsic::amdgcn_buffer_load: {
2144       executeInWaterfallLoop(MI, MRI, { 2 });
2145       return;
2146     }
2147     case Intrinsic::amdgcn_ds_ordered_add:
2148     case Intrinsic::amdgcn_ds_ordered_swap: {
2149       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2150       assert(OpdMapper.getVRegs(0).empty());
2151       substituteSimpleCopyRegs(OpdMapper, 3);
2152       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2153       return;
2154     }
2155     case Intrinsic::amdgcn_ds_gws_init:
2156     case Intrinsic::amdgcn_ds_gws_barrier:
2157     case Intrinsic::amdgcn_ds_gws_sema_br: {
2158       // Only the first lane executes, so readfirstlane is safe.
2159       substituteSimpleCopyRegs(OpdMapper, 1);
2160       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2161       return;
2162     }
2163     case Intrinsic::amdgcn_ds_gws_sema_v:
2164     case Intrinsic::amdgcn_ds_gws_sema_p:
2165     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2166       // Only the first lane executes, so readfirstlane is safe.
2167       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
2168       return;
2169     }
2170     case Intrinsic::amdgcn_s_sendmsg:
2171     case Intrinsic::amdgcn_s_sendmsghalt: {
2172       // FIXME: Should this use a waterfall loop?
2173       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2174       return;
2175     }
2176     case Intrinsic::amdgcn_raw_buffer_load:
2177     case Intrinsic::amdgcn_raw_buffer_load_format:
2178     case Intrinsic::amdgcn_raw_tbuffer_load:
2179     case Intrinsic::amdgcn_raw_buffer_store:
2180     case Intrinsic::amdgcn_raw_buffer_store_format:
2181     case Intrinsic::amdgcn_raw_tbuffer_store: {
2182       applyDefaultMapping(OpdMapper);
2183       executeInWaterfallLoop(MI, MRI, {2, 4});
2184       return;
2185     }
2186     case Intrinsic::amdgcn_struct_buffer_load:
2187     case Intrinsic::amdgcn_struct_buffer_store:
2188     case Intrinsic::amdgcn_struct_tbuffer_load:
2189     case Intrinsic::amdgcn_struct_tbuffer_store: {
2190       applyDefaultMapping(OpdMapper);
2191       executeInWaterfallLoop(MI, MRI, {2, 5});
2192       return;
2193     }
2194     default: {
2195       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2196               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2197         // Non-images can have complications from operands that allow both SGPR
2198         // and VGPR. For now it's too complicated to figure out the final opcode
2199         // to derive the register bank from the MCInstrDesc.
2200         if (RSrcIntrin->IsImage) {
2201           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2202           return;
2203         }
2204       }
2205 
2206       break;
2207     }
2208     }
2209     break;
2210   }
2211   case AMDGPU::G_LOAD:
2212   case AMDGPU::G_ZEXTLOAD:
2213   case AMDGPU::G_SEXTLOAD: {
2214     if (applyMappingWideLoad(MI, OpdMapper, MRI))
2215       return;
2216     break;
2217   }
2218   default:
2219     break;
2220   }
2221 
2222   return applyDefaultMapping(OpdMapper);
2223 }
2224 
2225 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
2226   const MachineFunction &MF = *MI.getParent()->getParent();
2227   const MachineRegisterInfo &MRI = MF.getRegInfo();
2228   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2229     if (!MI.getOperand(i).isReg())
2230       continue;
2231     Register Reg = MI.getOperand(i).getReg();
2232     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
2233       if (Bank->getID() != AMDGPU::SGPRRegBankID)
2234         return false;
2235     }
2236   }
2237   return true;
2238 }
2239 
2240 const RegisterBankInfo::InstructionMapping &
2241 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
2242   const MachineFunction &MF = *MI.getParent()->getParent();
2243   const MachineRegisterInfo &MRI = MF.getRegInfo();
2244   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2245 
2246   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2247     unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2248     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2249   }
2250   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2251                                MI.getNumOperands());
2252 }
2253 
2254 const RegisterBankInfo::InstructionMapping &
2255 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
2256   const MachineFunction &MF = *MI.getParent()->getParent();
2257   const MachineRegisterInfo &MRI = MF.getRegInfo();
2258   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2259   unsigned OpdIdx = 0;
2260 
2261   unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2262   OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
2263 
2264   if (MI.getOperand(OpdIdx).isIntrinsicID())
2265     OpdsMapping[OpdIdx++] = nullptr;
2266 
2267   Register Reg1 = MI.getOperand(OpdIdx).getReg();
2268   unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
2269 
2270   unsigned DefaultBankID = Size1 == 1 ?
2271     AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
2272   unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
2273 
2274   OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
2275 
2276   for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
2277     const MachineOperand &MO = MI.getOperand(OpdIdx);
2278     if (!MO.isReg())
2279       continue;
2280 
2281     unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
2282     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
2283     OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
2284   }
2285 
2286   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2287                                MI.getNumOperands());
2288 }
2289 
2290 const RegisterBankInfo::InstructionMapping &
2291 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
2292   const MachineFunction &MF = *MI.getParent()->getParent();
2293   const MachineRegisterInfo &MRI = MF.getRegInfo();
2294   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2295 
2296   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
2297     const MachineOperand &Op = MI.getOperand(I);
2298     if (!Op.isReg())
2299       continue;
2300 
2301     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
2302     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2303   }
2304 
2305   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2306                                MI.getNumOperands());
2307 }
2308 
2309 const RegisterBankInfo::InstructionMapping &
2310 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
2311                                         const MachineInstr &MI,
2312                                         int RsrcIdx) const {
2313   // The reported argument index is relative to the IR intrinsic call arguments,
2314   // so we need to shift by the number of defs and the intrinsic ID.
2315   RsrcIdx += MI.getNumExplicitDefs() + 1;
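  // For example, with one def the rsrc argument at IR index 1 lands at
  // machine operand index 3: the def, then the intrinsic ID, then the
  // arguments.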
2316 
2317   const int NumOps = MI.getNumOperands();
2318   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
2319 
2320   // TODO: Should packed/unpacked D16 difference be reported here as part of
2321   // the value mapping?
2322   for (int I = 0; I != NumOps; ++I) {
2323     if (!MI.getOperand(I).isReg())
2324       continue;
2325 
2326     Register OpReg = MI.getOperand(I).getReg();
2327     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
2328 
2329     // FIXME: Probably need a new intrinsic register bank searchable table to
2330     // handle arbitrary intrinsics easily.
2331     //
2332     // If this has a sampler, it immediately follows rsrc.
2333     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
2334 
2335     if (MustBeSGPR) {
2336       // This must be an SGPR, but we have to report whatever bank it has as legal.
2337       unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2338       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
2339     } else {
2340       // Some operands must be VGPR, and these are easy to copy to.
2341       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2342     }
2343   }
2344 
2345   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
2346 }
2347 
2348 const RegisterBankInfo::InstructionMapping &
2349 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
2350 
2351   const MachineFunction &MF = *MI.getParent()->getParent();
2352   const MachineRegisterInfo &MRI = MF.getRegInfo();
2353   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
2354   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2355   LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
2356   Register PtrReg = MI.getOperand(1).getReg();
2357   LLT PtrTy = MRI.getType(PtrReg);
2358   unsigned AS = PtrTy.getAddressSpace();
2359   unsigned PtrSize = PtrTy.getSizeInBits();
2360 
2361   const ValueMapping *ValMapping;
2362   const ValueMapping *PtrMapping;
2363 
2364   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
2365 
2366   if (PtrBank == &AMDGPU::SGPRRegBank &&
2367       (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
2368        AS != AMDGPUAS::PRIVATE_ADDRESS) &&
2369       isScalarLoadLegal(MI)) {
2370     // We have a uniform instruction so we want to use an SMRD load
2371     ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2372     PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
2373   } else {
2374     ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
2375     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
2376   }
2377 
2378   OpdsMapping[0] = ValMapping;
2379   OpdsMapping[1] = PtrMapping;
2380   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
2381       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
2382   return Mapping;
2383 
2384   // FIXME: Do we want to add a mapping for FLAT load, or should we just
2385   // handle that during instruction selection?
2386 }
2387 
2388 unsigned
2389 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
2390                                      const MachineRegisterInfo &MRI,
2391                                      const TargetRegisterInfo &TRI,
2392                                      unsigned Default) const {
2393   const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
2394   return Bank ? Bank->getID() : Default;
2395 }
2396 
2397 
2398 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
2399   return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
2400     AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2401 }
2402 
2403 static int regBankBoolUnion(int RB0, int RB1) {
2404   if (RB0 == -1)
2405     return RB1;
2406   if (RB1 == -1)
2407     return RB0;
2408 
2409   // vcc, vcc -> vcc
2410   // vcc, sgpr -> vcc
2411   // vcc, vgpr -> vcc
2412   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
2413     return AMDGPU::VCCRegBankID;
2414 
2415   // sgpr, sgpr -> sgpr; any combination with vgpr -> vgpr
2416   return regBankUnion(RB0, RB1);
2417 }
2418 
2419 const RegisterBankInfo::ValueMapping *
2420 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
2421                                          const MachineRegisterInfo &MRI,
2422                                          const TargetRegisterInfo &TRI) const {
2423   // Lie and claim anything is legal, even though this needs to be an SGPR;
2424   // applyMapping will have to deal with it as a waterfall loop.
2425   unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
2426   unsigned Size = getSizeInBits(Reg, MRI, TRI);
2427   return AMDGPU::getValueMapping(Bank, Size);
2428 }
2429 
2430 const RegisterBankInfo::ValueMapping *
2431 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
2432                                          const MachineRegisterInfo &MRI,
2433                                          const TargetRegisterInfo &TRI) const {
2434   unsigned Size = getSizeInBits(Reg, MRI, TRI);
2435   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2436 }
2437 
2438 const RegisterBankInfo::ValueMapping *
2439 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
2440                                          const MachineRegisterInfo &MRI,
2441                                          const TargetRegisterInfo &TRI) const {
2442   unsigned Size = getSizeInBits(Reg, MRI, TRI);
2443   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
2444 }
2445 
2446 ///
2447 /// This function must return a legal mapping, because
2448 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
2449 /// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
2450 /// VGPR-to-SGPR copy to be generated is illegal.
2451 ///
2452 const RegisterBankInfo::InstructionMapping &
2453 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
2454   const MachineFunction &MF = *MI.getParent()->getParent();
2455   const MachineRegisterInfo &MRI = MF.getRegInfo();
2456 
2457   if (MI.isRegSequence()) {
2458     // If any input is a VGPR, the result must be a VGPR. The default handling
2459     // assumes any copy between banks is legal.
2460     unsigned BankID = AMDGPU::SGPRRegBankID;
2461 
2462     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2463       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
2464       // It doesn't make sense to use vcc or scc banks here, so just ignore
2465       // them.
2466       if (OpBank != AMDGPU::SGPRRegBankID) {
2467         BankID = AMDGPU::VGPRRegBankID;
2468         break;
2469       }
2470     }
2471     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2472 
2473     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
2474     return getInstructionMapping(
2475         1, /*Cost*/ 1,
2476         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2477   }
2478 
2479   // The default handling is broken and doesn't handle illegal VGPR->SGPR copies
2480   // properly.
2481   //
2482   // TODO: There are additional exec masking dependencies to analyze.
2483   if (MI.getOpcode() == TargetOpcode::G_PHI) {
2484     // TODO: Generate proper invalid bank enum.
2485     int ResultBank = -1;
2486     Register DstReg = MI.getOperand(0).getReg();
2487 
2488     // Sometimes the result may have already been assigned a bank.
2489     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
2490       ResultBank = DstBank->getID();
2491 
2492     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2493       Register Reg = MI.getOperand(I).getReg();
2494       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
2495 
2496       // FIXME: Assuming VGPR for any undetermined inputs.
2497       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
2498         ResultBank = AMDGPU::VGPRRegBankID;
2499         break;
2500       }
2501 
2502       // FIXME: Need to promote SGPR case to s32
2503       unsigned OpBank = Bank->getID();
2504       ResultBank = regBankBoolUnion(ResultBank, OpBank);
2505     }
2506 
2507     assert(ResultBank != -1);
2508 
2509     unsigned Size = MRI.getType(DstReg).getSizeInBits();
2510 
2511     const ValueMapping &ValMap =
2512         getValueMapping(0, Size, getRegBank(ResultBank));
2513     return getInstructionMapping(
2514         1, /*Cost*/ 1,
2515         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2516   }
2517 
2518   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
2519   if (Mapping.isValid())
2520     return Mapping;
2521 
2522   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2523 
2524   switch (MI.getOpcode()) {
2525   default:
2526     return getInvalidInstructionMapping();
2527 
2528   case AMDGPU::G_AND:
2529   case AMDGPU::G_OR:
2530   case AMDGPU::G_XOR: {
2531     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2532     if (Size == 1) {
2533       const RegisterBank *DstBank
2534         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
2535 
2536       unsigned TargetBankID = -1;
2537       unsigned BankLHS = -1;
2538       unsigned BankRHS = -1;
2539       if (DstBank) {
2540         TargetBankID = DstBank->getID();
2541         if (DstBank == &AMDGPU::VCCRegBank) {
2542           TargetBankID = AMDGPU::VCCRegBankID;
2543           BankLHS = AMDGPU::VCCRegBankID;
2544           BankRHS = AMDGPU::VCCRegBankID;
2545         } else {
2546           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2547                                  AMDGPU::SGPRRegBankID);
2548           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2549                                  AMDGPU::SGPRRegBankID);
2550         }
2551       } else {
2552         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2553                                AMDGPU::VCCRegBankID);
2554         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2555                                AMDGPU::VCCRegBankID);
2556 
2557         // Both inputs should be true booleans to produce a boolean result.
2558         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
2559           TargetBankID = AMDGPU::VGPRRegBankID;
2560         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
2561           TargetBankID = AMDGPU::VCCRegBankID;
2562           BankLHS = AMDGPU::VCCRegBankID;
2563           BankRHS = AMDGPU::VCCRegBankID;
2564         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
2565           TargetBankID = AMDGPU::SGPRRegBankID;
2566         }
2567       }
2568 
2569       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
2570       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
2571       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
2572       break;
2573     }
2574 
2575     if (Size == 64) {
2576 
2577       if (isSALUMapping(MI)) {
2578         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
2579         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
2580       } else {
2581         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
2582         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
2583         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
2584 
2585         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
2586         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
2587       }
2588 
2589       break;
2590     }
2591 
2592     LLVM_FALLTHROUGH;
2593   }
2594   case AMDGPU::G_PTR_ADD:
2595   case AMDGPU::G_ADD:
2596   case AMDGPU::G_SUB:
2597   case AMDGPU::G_MUL:
2598   case AMDGPU::G_SHL:
2599   case AMDGPU::G_LSHR:
2600   case AMDGPU::G_ASHR:
2601   case AMDGPU::G_UADDO:
2602   case AMDGPU::G_USUBO:
2603   case AMDGPU::G_UADDE:
2604   case AMDGPU::G_SADDE:
2605   case AMDGPU::G_USUBE:
2606   case AMDGPU::G_SSUBE:
2607   case AMDGPU::G_SMIN:
2608   case AMDGPU::G_SMAX:
2609   case AMDGPU::G_UMIN:
2610   case AMDGPU::G_UMAX:
2611     if (isSALUMapping(MI))
2612       return getDefaultMappingSOP(MI);
2613     LLVM_FALLTHROUGH;
2614 
2615   case AMDGPU::G_FADD:
2616   case AMDGPU::G_FSUB:
2617   case AMDGPU::G_FPTOSI:
2618   case AMDGPU::G_FPTOUI:
2619   case AMDGPU::G_FMUL:
2620   case AMDGPU::G_FMA:
2621   case AMDGPU::G_FMAD:
2622   case AMDGPU::G_FSQRT:
2623   case AMDGPU::G_FFLOOR:
2624   case AMDGPU::G_FCEIL:
2625   case AMDGPU::G_FRINT:
2626   case AMDGPU::G_SITOFP:
2627   case AMDGPU::G_UITOFP:
2628   case AMDGPU::G_FPTRUNC:
2629   case AMDGPU::G_FPEXT:
2630   case AMDGPU::G_FEXP2:
2631   case AMDGPU::G_FLOG2:
2632   case AMDGPU::G_FMINNUM:
2633   case AMDGPU::G_FMAXNUM:
2634   case AMDGPU::G_FMINNUM_IEEE:
2635   case AMDGPU::G_FMAXNUM_IEEE:
2636   case AMDGPU::G_FCANONICALIZE:
2637   case AMDGPU::G_INTRINSIC_TRUNC:
2638   case AMDGPU::G_AMDGPU_FFBH_U32:
2639     return getDefaultMappingVOP(MI);
2640   case AMDGPU::G_UMULH:
2641   case AMDGPU::G_SMULH: {
2642     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
2643       return getDefaultMappingSOP(MI);
2644     return getDefaultMappingVOP(MI);
2645   }
2646   case AMDGPU::G_IMPLICIT_DEF: {
2647     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2648     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2649     break;
2650   }
2651   case AMDGPU::G_FCONSTANT:
2652   case AMDGPU::G_CONSTANT:
2653   case AMDGPU::G_GLOBAL_VALUE:
2654   case AMDGPU::G_BLOCK_ADDR:
2655   case AMDGPU::G_READCYCLECOUNTER: {
2656     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2657     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2658     break;
2659   }
2660   case AMDGPU::G_FRAME_INDEX: {
2661     // TODO: This should be the same as other constants, but eliminateFrameIndex
2662     // currently assumes VALU uses.
2663     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2664     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2665     break;
2666   }
2667   case AMDGPU::G_INSERT: {
2668     unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2669                                           AMDGPU::VGPRRegBankID;
2670     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2671     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2672     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
2673     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2674     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2675     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
2676     OpdsMapping[3] = nullptr;
2677     break;
2678   }
2679   case AMDGPU::G_EXTRACT: {
2680     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2681     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2682     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2683     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2684     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
    OpdsMapping[2] = nullptr;
    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
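    // A <2 x s16> result fits in a single 32-bit register, so it can keep a
    // plain 32-bit mapping rather than being split into 16-bit pieces. A
    // sketch of the MIR this covers (register names are illustrative):
    //   %v:vgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %a:vgpr(s32), %b:vgpr(s32)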
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == LLT::vector(2, 16)) {
      unsigned DstSize = DstTy.getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
      unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);

      OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
      break;
    }

    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_MERGE_VALUES:
  case AMDGPU::G_CONCAT_VECTORS: {
    unsigned Bank = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
    // Op1 and Dst should use the same register bank.
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_INTTOPTR:
  case AMDGPU::G_PTRTOINT:
  case AMDGPU::G_CTLZ:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ:
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BSWAP:
  case AMDGPU::G_BITREVERSE:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FNEG: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
    break;
  }
  case AMDGPU::G_TRUNC: {
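    // An s1 result whose source is not already in an SGPR is treated as a VCC
    // boolean, e.g. (a sketch):
    //   %b:vcc(s1) = G_TRUNC %x:vgpr(s32)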
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned Bank = getRegBankID(Src, MRI, *TRI);
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
    OpdsMapping[0] = DstSize == 1 && Bank != AMDGPU::SGPRRegBankID ?
      AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize) :
      AMDGPU::getValueMapping(Bank, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
    break;
  }
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ANYEXT: {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
    unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);

    unsigned DstBank;
    const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
    assert(SrcBank);
    switch (SrcBank->getID()) {
    case AMDGPU::SGPRRegBankID:
      DstBank = AMDGPU::SGPRRegBankID;
      break;
    default:
      DstBank = AMDGPU::VGPRRegBankID;
      break;
    }

    // TODO: Should anyext be split into 32-bit parts as well?
    if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
      OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
    } else {
      // A scalar extend can use a 64-bit BFE, but VGPRs require extending to
      // 32 bits first, and then to 64.
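      // For example, an all-SGPR s32 -> s64 sext can be a single S_BFE_I64,
      // while the VGPR path produces a 32-bit extend followed by a 64-bit
      // merge (a sketch of the expected lowering, not enforced here).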
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
      OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
                                                         SrcSize);
    }
    break;
  }
  case AMDGPU::G_FCMP: {
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    break;
  }
  case AMDGPU::G_STORE: {
    assert(MI.getOperand(0).isReg());
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    // FIXME: We need to specify a different reg bank once scalar stores
    // are supported.
    const ValueMapping *ValMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    // FIXME: Depending on the type of store, the pointer could be in
    // the SGPR Reg bank.
    // FIXME: Pointer size should be based on the address space.
    const ValueMapping *PtrMapping =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);

    OpdsMapping[0] = ValMapping;
    OpdsMapping[1] = PtrMapping;
    break;
  }

  case AMDGPU::G_ICMP: {
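    // A compare can use SCC only when both sources are SGPRs. 64-bit scalar
    // compares are further restricted to eq/ne (e.g. S_CMP_EQ_U64), and only
    // on subtargets that have them, hence the hasScalarCompareEq64() check.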
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
      (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                      Subtarget.hasScalarCompareEq64()));

    unsigned Op0Bank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;

    // TODO: Use 32-bit for scalar output size.
    // SCC results will need to be copied to a 32-bit SGPR virtual register.
    const unsigned ResultSize = 1;

    OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, ResultSize);
    OpdsMapping[1] = nullptr; // Predicate Operand.
    OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // A VGPR index can be handled with a waterfall loop when indexing an
    // SGPR vector.
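    // For example (a sketch), a divergent index into a uniform vector:
    //   %elt:vgpr(s32) = G_EXTRACT_VECTOR_ELT %vec:sgpr(<4 x s32>), %idx:vgpr(s32)
    // is executed by iterating over the unique index values in the wave.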
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be in either bank if the source vector is in a VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    unsigned OutputBankID = isSALUMapping(MI) ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
                                            MRI, *TRI);
    unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
    OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
                                                       InsertSize);

    // The index can be in either bank if the source vector is in a VGPR.
    OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
    break;
  }
  case AMDGPU::G_UNMERGE_VALUES: {
    unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
      AMDGPU::VGPRRegBankID;

    // The source and all destinations should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_ubfe:
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_wqm:
      return getDefaultMappingVOP(MI);
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register Offset = MI.getOperand(3).getReg(); // SGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
      OpdsMapping[4] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);

      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
                                               Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
                                               Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
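      // Note: deliberately falls through to amdgcn_readfirstlane below to
      // share the SGPR result / VGPR source mapping.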
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These two must be SGPRs, but accept VGPRs; a readfirstlane will be
      // inserted to legalize them.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume:
    case Intrinsic::amdgcn_ds_fadd:
    case Intrinsic::amdgcn_ds_fmin:
    case Intrinsic::amdgcn_ds_fmax:
    case Intrinsic::amdgcn_atomic_inc:
    case Intrinsic::amdgcn_atomic_dec:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_buffer_load: {
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register VIndex = MI.getOperand(3).getReg(); // VGPR
      Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
      unsigned Size4 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
      OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
      OpdsMapping[5] = nullptr;
      OpdsMapping[6] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf:
    case Intrinsic::amdgcn_init_exec: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make the intrinsic ID the last operand of the
      // instruction; then this would be the same as a store.
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage)
          return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
      }

      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
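    // The condition bank picks the instruction form: an SCC condition with
    // all-SGPR operands allows a scalar select (e.g. S_CSELECT_B32), while a
    // VCC condition implies a V_CNDMASK-style select with a VGPR result.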
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
    return getDefaultMappingAllVGPR(MI);
  }
  case AMDGPU::G_BRCOND: {
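    // A condition already in an SGPR can use a uniform SCC branch; any other
    // boolean is treated as a divergent VCC condition.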
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}