1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPURegisterBankInfo.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
25 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
26 #include "llvm/CodeGen/TargetRegisterInfo.h"
27 #include "llvm/CodeGen/TargetSubtargetInfo.h"
28 #include "llvm/IR/Constants.h"
29
30 #define GET_TARGET_REGBANK_IMPL
31 #include "AMDGPUGenRegisterBank.inc"
32
33 // This file will be TableGen'ed at some point.
34 #include "AMDGPUGenRegisterBankInfo.def"
35
36 using namespace llvm;
37 using namespace MIPatternMatch;
38
39 namespace {
40
41 // Observer to apply a register bank to new registers created by LegalizerHelper.
42 class ApplyRegBankMapping final : public GISelChangeObserver {
43 private:
44 const AMDGPURegisterBankInfo &RBI;
45 MachineRegisterInfo &MRI;
46 const RegisterBank *NewBank;
47 SmallVector<MachineInstr *, 4> NewInsts;
48
49 public:
50 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
51 MachineRegisterInfo &MRI_, const RegisterBank *RB)
52 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
53
54 ~ApplyRegBankMapping() {
55 for (MachineInstr *MI : NewInsts)
56 applyBank(*MI);
57 }
58
59 /// Set any registers that don't have a set register class or bank to SALU.
60 void applyBank(MachineInstr &MI) {
61 const unsigned Opc = MI.getOpcode();
62 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
63 Opc == AMDGPU::G_SEXT) {
64 // LegalizerHelper wants to use the basic legalization artifacts when
65 // widening etc. We don't handle selection with vcc in artifact sources,
66 // so we need to use a select instead to handle these properly.
67 Register DstReg = MI.getOperand(0).getReg();
68 Register SrcReg = MI.getOperand(1).getReg();
69 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
70 if (SrcBank == &AMDGPU::VCCRegBank) {
71 const LLT S32 = LLT::scalar(32);
72 assert(MRI.getType(SrcReg) == LLT::scalar(1));
73 assert(MRI.getType(DstReg) == S32);
74 assert(NewBank == &AMDGPU::VGPRRegBank);
75
76 // Replace the extension with a select, which really uses the boolean
77 // source.
78 MachineIRBuilder B(MI);
79 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
80 auto False = B.buildConstant(S32, 0);
81 B.buildSelect(DstReg, SrcReg, True, False);
82 MRI.setRegBank(True.getReg(0), *NewBank);
83 MRI.setRegBank(False.getReg(0), *NewBank);
84 MI.eraseFromParent();
85 }
86
87 assert(!MRI.getRegClassOrRegBank(DstReg));
88 MRI.setRegBank(DstReg, *NewBank);
89 return;
90 }
91
92 #ifndef NDEBUG
93 if (Opc == AMDGPU::G_TRUNC) {
94 Register DstReg = MI.getOperand(0).getReg();
95 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
96 assert(DstBank != &AMDGPU::VCCRegBank);
97 }
98 #endif
99
100 for (MachineOperand &Op : MI.operands()) {
101 if (!Op.isReg())
102 continue;
103
104 Register Reg = Op.getReg();
105 if (MRI.getRegClassOrRegBank(Reg))
106 continue;
107
108 const RegisterBank *RB = NewBank;
109 if (MRI.getType(Reg) == LLT::scalar(1)) {
110 assert(NewBank == &AMDGPU::VGPRRegBank &&
111 "s1 operands should only be used for vector bools");
112 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
113 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
114 "not expecting legalization artifacts here");
115 RB = &AMDGPU::VCCRegBank;
116 }
117
118 MRI.setRegBank(Reg, *RB);
119 }
120 }
121
122 void erasingInstr(MachineInstr &MI) override {}
123
124 void createdInstr(MachineInstr &MI) override {
125 // At this point, the instruction was just inserted and has no operands.
126 NewInsts.push_back(&MI);
127 }
128
129 void changingInstr(MachineInstr &MI) override {}
130 void changedInstr(MachineInstr &MI) override {}
131 };
132
133 }
134 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
135 : AMDGPUGenRegisterBankInfo(),
136 Subtarget(ST),
137 TRI(Subtarget.getRegisterInfo()),
138 TII(Subtarget.getInstrInfo()) {
139
140 // HACK: Until this is fully tablegen'd.
141 static bool AlreadyInit = false;
142 if (AlreadyInit)
143 return;
144
145 AlreadyInit = true;
146
147 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
148 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
149 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
150 }
151
152 static bool isVectorRegisterBank(const RegisterBank &Bank) {
153 unsigned BankID = Bank.getID();
154 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
155 }
156
157 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
158 const RegisterBank &Src,
159 unsigned Size) const {
160 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
161 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
162 isVectorRegisterBank(Src)) {
163 return std::numeric_limits<unsigned>::max();
164 }
165
166 // Bool values are tricky, because the meaning is based on context. The SCC
167 // and VCC banks are for the natural scalar and vector conditions produced by
168 // a compare.
169 //
170 // Legalization doesn't know about the necessary context, so an s1 use may
171 // have been a truncate from an arbitrary value, in which case a copy (lowered
172 // as a compare with 0) needs to be inserted.
173 if (Size == 1 &&
174 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
175 (isVectorRegisterBank(Src) ||
176 Src.getID() == AMDGPU::SGPRRegBankID ||
177 Src.getID() == AMDGPU::VCCRegBankID))
178 return std::numeric_limits<unsigned>::max();
179
180 if (Src.getID() == AMDGPU::VCCRegBankID)
181 return std::numeric_limits<unsigned>::max();
182
183 // There is no direct copy between AGPRs.
184 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
185 Src.getID() == AMDGPU::AGPRRegBankID)
186 return 4;
187
188 return RegisterBankInfo::copyCost(Dst, Src, Size);
189 }
190
191 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
192 const ValueMapping &ValMapping,
193 const RegisterBank *CurBank) const {
194 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
195 // VGPR.
196 // FIXME: Is there a better way to do this?
197 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
198 return 10; // This is expensive.
199
200 assert(ValMapping.NumBreakDowns == 2 &&
201 ValMapping.BreakDown[0].Length == 32 &&
202 ValMapping.BreakDown[0].StartIdx == 0 &&
203 ValMapping.BreakDown[1].Length == 32 &&
204 ValMapping.BreakDown[1].StartIdx == 32 &&
205 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
206
207 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
208 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
209 // want.
210
211 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
212 // alignment restrictions, but this probably isn't important.
213 return 1;
214 }
215
216 const RegisterBank &
217 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
218 LLT Ty) const {
219 if (&RC == &AMDGPU::SReg_1RegClass)
220 return AMDGPU::VCCRegBank;
221
222 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
223 // VCC-like use.
224 if (TRI->isSGPRClass(&RC)) {
225 // FIXME: This probably came from a copy from a physical register, which
226 // should be inferable from the copied-to type. We don't have many boolean
227 // physical register constraints so just assume a normal SGPR for now.
228 if (!Ty.isValid())
229 return AMDGPU::SGPRRegBank;
230
231 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
232 }
233
234 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
235 }
236
237 template <unsigned NumOps>
238 RegisterBankInfo::InstructionMappings
239 AMDGPURegisterBankInfo::addMappingFromTable(
240 const MachineInstr &MI, const MachineRegisterInfo &MRI,
241 const std::array<unsigned, NumOps> RegSrcOpIdx,
242 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
243
244 InstructionMappings AltMappings;
245
246 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
247
248 unsigned Sizes[NumOps];
249 for (unsigned I = 0; I < NumOps; ++I) {
250 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
251 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
252 }
253
254 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
255 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
256 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
257 }
258
259 // getInstrMapping's default mapping uses ID 1, so start at 2.
260 unsigned MappingID = 2;
261 for (const auto &Entry : Table) {
262 for (unsigned I = 0; I < NumOps; ++I) {
263 int OpIdx = RegSrcOpIdx[I];
264 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
265 }
266
267 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
268 getOperandsMapping(Operands),
269 Operands.size()));
270 }
271
272 return AltMappings;
273 }
274
275 RegisterBankInfo::InstructionMappings
276 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
277 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
278 switch (MI.getIntrinsicID()) {
279 case Intrinsic::amdgcn_readlane: {
280 static const OpRegBankEntry<3> Table[2] = {
281 // Perfectly legal.
282 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
283
284 // Need a readfirstlane for the index.
285 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
286 };
287
288 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
289 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
290 }
291 case Intrinsic::amdgcn_writelane: {
292 static const OpRegBankEntry<4> Table[4] = {
293 // Perfectly legal.
294 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
295
296 // Need readfirstlane of first op
297 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
298
299 // Need readfirstlane of second op
300 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
301
302 // Need readfirstlane of both ops
303 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
304 };
305
306 // vdst, value, lane select, old vgpr value
307 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
308 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
309 }
310 default:
311 return RegisterBankInfo::getInstrAlternativeMappings(MI);
312 }
313 }
314
315 RegisterBankInfo::InstructionMappings
316 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
317 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
318
319 switch (MI.getIntrinsicID()) {
320 case Intrinsic::amdgcn_buffer_load: {
321 static const OpRegBankEntry<3> Table[4] = {
322 // Perfectly legal.
323 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
324 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
325
326 // Waterfall loop needed for rsrc. In the worst case this will execute
327 // approximately an extra 10 * wavesize + 2 instructions.
328 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
329 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
330 };
331
332 // rsrc, voffset, offset
333 const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
334 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
335 }
336 case Intrinsic::amdgcn_s_buffer_load: {
337 static const OpRegBankEntry<2> Table[4] = {
338 // Perfectly legal.
339 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
340
341 // Only need 1 register in loop
342 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
343
344 // Have to waterfall the resource.
345 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
346
347 // Have to waterfall the resource, and the offset.
348 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
349 };
350
351 // rsrc, offset
352 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
353 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
354 }
355 case Intrinsic::amdgcn_ds_ordered_add:
356 case Intrinsic::amdgcn_ds_ordered_swap: {
357 // VGPR = M0, VGPR
358 static const OpRegBankEntry<3> Table[2] = {
359 // Perfectly legal.
360 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
361
362 // Need a readfirstlane for m0
363 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
364 };
365
366 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
367 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368 }
369 case Intrinsic::amdgcn_s_sendmsg:
370 case Intrinsic::amdgcn_s_sendmsghalt: {
371 // FIXME: Should have no register for immediate
372 static const OpRegBankEntry<1> Table[2] = {
373 // Perfectly legal.
374 { { AMDGPU::SGPRRegBankID }, 1 },
375
376 // Need readlane
377 { { AMDGPU::VGPRRegBankID }, 3 }
378 };
379
380 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
381 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
382 }
383 default:
384 return RegisterBankInfo::getInstrAlternativeMappings(MI);
385 }
386 }
387
388 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
389 const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
390 return I && I->getMetadata("amdgpu.noclobber");
391 }
392
393 // FIXME: Returns uniform if there's no source value information. This is
394 // probably wrong.
395 static bool isScalarLoadLegal(const MachineInstr &MI) {
396 if (!MI.hasOneMemOperand())
397 return false;
398
399 const MachineMemOperand *MMO = *MI.memoperands_begin();
400 const unsigned AS = MMO->getAddrSpace();
401 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
402 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
403
404 // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
405 return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
406 // Can't do a scalar atomic load.
407 !MMO->isAtomic() &&
408 // Don't use scalar loads for volatile accesses to non-constant address
409 // spaces.
410 (IsConst || !MMO->isVolatile()) &&
411 // Memory must be known constant, or not written before this load.
412 (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
413 AMDGPUInstrInfo::isUniformMMO(MMO);
414 }
415
416 RegisterBankInfo::InstructionMappings
417 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
418 const MachineInstr &MI) const {
419
420 const MachineFunction &MF = *MI.getParent()->getParent();
421 const MachineRegisterInfo &MRI = MF.getRegInfo();
422
423
424 InstructionMappings AltMappings;
425 switch (MI.getOpcode()) {
426 case TargetOpcode::G_CONSTANT: {
427 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
428 if (Size == 1) {
429 static const OpRegBankEntry<1> Table[3] = {
430 { { AMDGPU::VGPRRegBankID }, 1 },
431 { { AMDGPU::SGPRRegBankID }, 1 },
432 { { AMDGPU::VCCRegBankID }, 1 }
433 };
434
435 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
436 }
437
438 LLVM_FALLTHROUGH;
439 }
440 case TargetOpcode::G_FCONSTANT:
441 case TargetOpcode::G_FRAME_INDEX:
442 case TargetOpcode::G_GLOBAL_VALUE: {
443 static const OpRegBankEntry<1> Table[2] = {
444 { { AMDGPU::VGPRRegBankID }, 1 },
445 { { AMDGPU::SGPRRegBankID }, 1 }
446 };
447
448 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
449 }
450 case TargetOpcode::G_AND:
451 case TargetOpcode::G_OR:
452 case TargetOpcode::G_XOR: {
453 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
454
455 if (Size == 1) {
456 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
457 const InstructionMapping &SCCMapping = getInstructionMapping(
458 1, 1, getOperandsMapping(
459 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
460 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
461 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
462 3); // Num Operands
463 AltMappings.push_back(&SCCMapping);
464
465 const InstructionMapping &VCCMapping0 = getInstructionMapping(
466 2, 1, getOperandsMapping(
467 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
468 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
469 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
470 3); // Num Operands
471 AltMappings.push_back(&VCCMapping0);
472 return AltMappings;
473 }
474
475 if (Size != 64)
476 break;
477
478 const InstructionMapping &SSMapping = getInstructionMapping(
479 1, 1, getOperandsMapping(
480 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
481 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
482 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
483 3); // Num Operands
484 AltMappings.push_back(&SSMapping);
485
486 const InstructionMapping &VVMapping = getInstructionMapping(
487 2, 2, getOperandsMapping(
488 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
489 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
490 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
491 3); // Num Operands
492 AltMappings.push_back(&VVMapping);
493
494 const InstructionMapping &SVMapping = getInstructionMapping(
495 3, 3, getOperandsMapping(
496 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
497 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
498 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
499 3); // Num Operands
500 AltMappings.push_back(&SVMapping);
501
502 // SGPR in LHS is slightly preferable, so make it VS more expensive than
503 // SV.
504 const InstructionMapping &VSMapping = getInstructionMapping(
505 3, 4, getOperandsMapping(
506 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
507 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
508 AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
509 3); // Num Operands
510 AltMappings.push_back(&VSMapping);
511 break;
512 }
513 case TargetOpcode::G_LOAD:
514 case TargetOpcode::G_ZEXTLOAD:
515 case TargetOpcode::G_SEXTLOAD: {
516 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
517 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
518 unsigned PtrSize = PtrTy.getSizeInBits();
519 unsigned AS = PtrTy.getAddressSpace();
520 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
521
522 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
523 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
524 isScalarLoadLegal(MI)) {
525 const InstructionMapping &SSMapping = getInstructionMapping(
526 1, 1, getOperandsMapping(
527 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
528 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
529 2); // Num Operands
530 AltMappings.push_back(&SSMapping);
531 }
532
533 const InstructionMapping &VVMapping = getInstructionMapping(
534 2, 1, getOperandsMapping(
535 {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
536 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
537 2); // Num Operands
538 AltMappings.push_back(&VVMapping);
539
540 // It may be possible to have a vgpr = load sgpr mapping here, because
541 // the mubuf instructions support this kind of load, but probably for only
542 // gfx7 and older. However, the addressing mode matching in the instruction
543 // selector should be able to do a better job of detecting and selecting
544 // these kinds of loads from the vgpr = load vgpr mapping.
545
546 return AltMappings;
547
548 }
549 case TargetOpcode::G_ICMP: {
550 // TODO: Should report 32-bit for scalar output type.
551 unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
552 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
553 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
554 nullptr, // Predicate operand.
555 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
556 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
557 4); // Num Operands
558 AltMappings.push_back(&SSMapping);
559
560 const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
561 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
562 nullptr, // Predicate operand.
563 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
564 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
565 4); // Num Operands
566 AltMappings.push_back(&SVMapping);
567
568 const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
569 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
570 nullptr, // Predicate operand.
571 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
572 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
573 4); // Num Operands
574 AltMappings.push_back(&VSMapping);
575
576 const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
577 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
578 nullptr, // Predicate operand.
579 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
580 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
581 4); // Num Operands
582 AltMappings.push_back(&VVMapping);
583
584 return AltMappings;
585 }
586 case TargetOpcode::G_SELECT: {
587 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
588 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
589 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
590 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
591 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
592 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
593 4); // Num Operands
594 AltMappings.push_back(&SSMapping);
595
596 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
597 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
598 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
599 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
600 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
601 4); // Num Operands
602 AltMappings.push_back(&VVMapping);
603
604 return AltMappings;
605 }
606 case TargetOpcode::G_SMIN:
607 case TargetOpcode::G_SMAX:
608 case TargetOpcode::G_UMIN:
609 case TargetOpcode::G_UMAX: {
610 static const OpRegBankEntry<3> Table[4] = {
611 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
612 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
613 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
614
615 // Scalar requires cmp+select, and extends if 16-bit.
616 // FIXME: Should there be separate costs for 32 and 16-bit
617 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
618 };
619
620 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
621 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
622 }
623 case TargetOpcode::G_UADDE:
624 case TargetOpcode::G_USUBE:
625 case TargetOpcode::G_SADDE:
626 case TargetOpcode::G_SSUBE: {
627 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
628 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
629 getOperandsMapping(
630 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
631 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
632 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
633 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
634 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
635 5); // Num Operands
636 AltMappings.push_back(&SSMapping);
637
638 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
639 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
640 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
641 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
642 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
643 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
644 5); // Num Operands
645 AltMappings.push_back(&VVMapping);
646 return AltMappings;
647 }
648 case AMDGPU::G_BRCOND: {
649 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
650
651 // TODO: Change type to 32 for scalar
652 const InstructionMapping &SMapping = getInstructionMapping(
653 1, 1, getOperandsMapping(
654 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
655 2); // Num Operands
656 AltMappings.push_back(&SMapping);
657
658 const InstructionMapping &VMapping = getInstructionMapping(
659 1, 1, getOperandsMapping(
660 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
661 2); // Num Operands
662 AltMappings.push_back(&VMapping);
663 return AltMappings;
664 }
665 case AMDGPU::G_INTRINSIC:
666 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
667 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
668 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
669 default:
670 break;
671 }
672 return RegisterBankInfo::getInstrAlternativeMappings(MI);
673 }
674
675 void AMDGPURegisterBankInfo::split64BitValueForMapping(
676 MachineIRBuilder &B,
677 SmallVector<Register, 2> &Regs,
678 LLT HalfTy,
679 Register Reg) const {
680 assert(HalfTy.getSizeInBits() == 32);
681 MachineRegisterInfo *MRI = B.getMRI();
682 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
683 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
684 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
685 MRI->setRegBank(LoLHS, *Bank);
686 MRI->setRegBank(HiLHS, *Bank);
687
688 Regs.push_back(LoLHS);
689 Regs.push_back(HiLHS);
690
691 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
692 .addDef(LoLHS)
693 .addDef(HiLHS)
694 .addUse(Reg);
695 }
696
697 /// Replace the current type each register in \p Regs has with \p NewTy
698 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
699 LLT NewTy) {
700 for (Register Reg : Regs) {
701 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
702 MRI.setType(Reg, NewTy);
703 }
704 }
705
706 static LLT getHalfSizedType(LLT Ty) {
707 if (Ty.isVector()) {
708 assert(Ty.getNumElements() % 2 == 0);
709 return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
710 }
711
712 assert(Ty.getSizeInBits() % 2 == 0);
713 return LLT::scalar(Ty.getSizeInBits() / 2);
714 }
715
716 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
717 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
718 /// execute the instruction for each unique combination of values in all lanes
719 /// in the wave. The block will be split such that rest of the instructions are
720 /// moved to a new block.
721 ///
722 /// Essentially performs this loop:
723 ///
724 /// Save Execution Mask
725 /// For (Lane : Wavefront) {
726 /// Enable Lane, Disable all other lanes
727 /// SGPR = read SGPR value for current lane from VGPR
728 /// VGPRResult[Lane] = use_op SGPR
729 /// }
730 /// Restore Execution Mask
731 ///
732 /// There is additional complexity from comparing the values across lanes to
733 /// identify the unique values used, so each iteration handles all matching lanes.
734 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
735 MachineIRBuilder &B,
736 iterator_range<MachineBasicBlock::iterator> Range,
737 SmallSet<Register, 4> &SGPROperandRegs,
738 MachineRegisterInfo &MRI) const {
739 SmallVector<Register, 4> ResultRegs;
740 SmallVector<Register, 4> InitResultRegs;
741 SmallVector<Register, 4> PhiRegs;
742
743 MachineBasicBlock &MBB = B.getMBB();
744 MachineFunction *MF = &B.getMF();
745
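  // The wave-level mask operations and the exec register differ between wave32
  // and wave64 subtargets, so select the 32- or 64-bit forms of the
  // s_and/s_mov/s_xor/saveexec opcodes up front.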
746 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
747 const unsigned WaveAndOpc = Subtarget.isWave32() ?
748 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
749 const unsigned MovTermOpc = Subtarget.isWave32() ?
750 AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
751 const unsigned XorTermOpc = Subtarget.isWave32() ?
752 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
753 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
754 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
755 const unsigned ExecReg = Subtarget.isWave32() ?
756 AMDGPU::EXEC_LO : AMDGPU::EXEC;
757
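  // For every def produced inside the loop, create an undef initial value and
  // a phi register so the result can be carried across loop iterations.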
758 for (MachineInstr &MI : Range) {
759 for (MachineOperand &Def : MI.defs()) {
760 LLT ResTy = MRI.getType(Def.getReg());
761 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
762 ResultRegs.push_back(Def.getReg());
763 Register InitReg = B.buildUndef(ResTy).getReg(0);
764 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
765 InitResultRegs.push_back(InitReg);
766 PhiRegs.push_back(PhiReg);
767 MRI.setRegBank(PhiReg, *DefBank);
768 MRI.setRegBank(InitReg, *DefBank);
769 }
770 }
771
772 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
773 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
774
775 // Don't bother using generic instructions/registers for the exec mask.
776 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
777 .addDef(InitSaveExecReg);
778
779 Register PhiExec = MRI.createVirtualRegister(WaveRC);
780 Register NewExec = MRI.createVirtualRegister(WaveRC);
781
782 // To insert the loop we need to split the block. Move everything after this
783 // point into a new remainder block, and insert the loop blocks before it.
784 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
785 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
786 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
787 MachineFunction::iterator MBBI(MBB);
788 ++MBBI;
789 MF->insert(MBBI, LoopBB);
790 MF->insert(MBBI, RestoreExecBB);
791 MF->insert(MBBI, RemainderBB);
792
793 LoopBB->addSuccessor(RestoreExecBB);
794 LoopBB->addSuccessor(LoopBB);
795
796 // Move the rest of the block into a new block.
797 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
798 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
799
800 MBB.addSuccessor(LoopBB);
801 RestoreExecBB->addSuccessor(RemainderBB);
802
803 B.setInsertPt(*LoopBB, LoopBB->end());
804
805 B.buildInstr(TargetOpcode::PHI)
806 .addDef(PhiExec)
807 .addReg(InitSaveExecReg)
808 .addMBB(&MBB)
809 .addReg(NewExec)
810 .addMBB(LoopBB);
811
812 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
813 B.buildInstr(TargetOpcode::G_PHI)
814 .addDef(std::get<2>(Result))
815 .addReg(std::get<0>(Result)) // Initial value / implicit_def
816 .addMBB(&MBB)
817 .addReg(std::get<1>(Result)) // Mid-loop value.
818 .addMBB(LoopBB);
819 }
820
821 const DebugLoc &DL = B.getDL();
822
823 // Figure out the iterator range after splicing the instructions.
824 auto NewBegin = std::prev(LoopBB->end());
825
826 // Move the instruction into the loop. Note we moved everything after
827 // Range.end() already into a new block, so Range.end() is no longer valid.
828 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
829
830 auto NewEnd = LoopBB->end();
831
832 MachineBasicBlock::iterator I = Range.begin();
833 B.setInsertPt(*LoopBB, I);
834
835 Register CondReg;
836
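  // CondReg accumulates the AND of the per-operand "matches the readfirstlane
  // value" compares; it selects which lanes are handled in this iteration of
  // the loop.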
837 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
838 for (MachineOperand &Op : MI.uses()) {
839 if (!Op.isReg() || Op.isDef())
840 continue;
841
842 if (SGPROperandRegs.count(Op.getReg())) {
843 LLT OpTy = MRI.getType(Op.getReg());
844 unsigned OpSize = OpTy.getSizeInBits();
845
846 // Can only do a readlane of 32-bit pieces.
847 if (OpSize == 32) {
848 // Avoid extra copies in the simple case of one 32-bit register.
849 Register CurrentLaneOpReg
850 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
851 MRI.setType(CurrentLaneOpReg, OpTy);
852
853 constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
854 // Read the next variant <- also loop target.
855 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
856 CurrentLaneOpReg)
857 .addReg(Op.getReg());
858
859 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
860 bool First = CondReg == AMDGPU::NoRegister;
861 if (First)
862 CondReg = NewCondReg;
863
864 // Compare the just read M0 value to all possible Idx values.
865 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
866 .addDef(NewCondReg)
867 .addReg(CurrentLaneOpReg)
868 .addReg(Op.getReg());
869 Op.setReg(CurrentLaneOpReg);
870
871 if (!First) {
872 Register AndReg = MRI.createVirtualRegister(WaveRC);
873
874 // If there are multiple operands to consider, AND the conditions together.
875 B.buildInstr(WaveAndOpc)
876 .addDef(AndReg)
877 .addReg(NewCondReg)
878 .addReg(CondReg);
879 CondReg = AndReg;
880 }
881 } else {
882 LLT S32 = LLT::scalar(32);
883 SmallVector<Register, 8> ReadlanePieces;
884
885 // The compares can be done as 64-bit, but the extract needs to be done
886 // in 32-bit pieces.
887
888 bool Is64 = OpSize % 64 == 0;
889
890 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
891 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
892 : AMDGPU::V_CMP_EQ_U32_e64;
893
897 // Insert the unmerge before the loop.
898
899 B.setMBB(MBB);
900 auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
901 B.setInstr(*I);
902
903 unsigned NumPieces = Unmerge->getNumOperands() - 1;
904 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
905 Register UnmergePiece = Unmerge.getReg(PieceIdx);
906
907 Register CurrentLaneOpReg;
908 if (Is64) {
909 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
910 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
911
912 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
913 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
914 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
915
916 // Read the next variant <- also loop target.
917 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
918 CurrentLaneOpRegLo)
919 .addReg(UnmergePiece, 0, AMDGPU::sub0);
920
921 // Read the next variant <- also loop target.
922 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
923 CurrentLaneOpRegHi)
924 .addReg(UnmergePiece, 0, AMDGPU::sub1);
925
926 CurrentLaneOpReg =
927 B.buildMerge(LLT::scalar(64),
928 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
929 .getReg(0);
930
931 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
932
933 if (OpTy.getScalarSizeInBits() == 64) {
934 // If we need to produce a 64-bit element vector, use the
935 // merged pieces.
936 ReadlanePieces.push_back(CurrentLaneOpReg);
937 } else {
938 // 32-bit element type.
939 ReadlanePieces.push_back(CurrentLaneOpRegLo);
940 ReadlanePieces.push_back(CurrentLaneOpRegHi);
941 }
942 } else {
943 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
944 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
945 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
946
947 // Read the next variant <- also loop target.
948 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
949 CurrentLaneOpReg)
950 .addReg(UnmergePiece);
951 ReadlanePieces.push_back(CurrentLaneOpReg);
952 }
953
954 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
955 bool First = CondReg == AMDGPU::NoRegister;
956 if (First)
957 CondReg = NewCondReg;
958
959 B.buildInstr(CmpOp)
960 .addDef(NewCondReg)
961 .addReg(CurrentLaneOpReg)
962 .addReg(UnmergePiece);
963
964 if (!First) {
965 Register AndReg = MRI.createVirtualRegister(WaveRC);
966
967 // If there are multiple operands to consider, AND the conditions together.
968 B.buildInstr(WaveAndOpc)
969 .addDef(AndReg)
970 .addReg(NewCondReg)
971 .addReg(CondReg);
972 CondReg = AndReg;
973 }
974 }
975
976 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
977 // BUILD_VECTOR
978 if (OpTy.isVector()) {
979 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
980 Op.setReg(Merge.getReg(0));
981 } else {
982 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
983 Op.setReg(Merge.getReg(0));
984 }
985
986 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
987 }
988 }
989 }
990 }
991
992 B.setInsertPt(*LoopBB, LoopBB->end());
993
994 // Update EXEC, save the original EXEC value to VCC.
995 B.buildInstr(AndSaveExecOpc)
996 .addDef(NewExec)
997 .addReg(CondReg, RegState::Kill);
998
999 MRI.setSimpleHint(NewExec, CondReg);
1000
1001 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1002 B.buildInstr(XorTermOpc)
1003 .addDef(ExecReg)
1004 .addReg(ExecReg)
1005 .addReg(NewExec);
1006
1007 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1008 // s_cbranch_scc0?
1009
1010 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1011 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1012 .addMBB(LoopBB);
1013
1014 // Save the EXEC mask before the loop.
1015 BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1016 .addReg(ExecReg);
1017
1018 // Restore the EXEC mask after the loop.
1019 B.setMBB(*RestoreExecBB);
1020 B.buildInstr(MovTermOpc)
1021 .addDef(ExecReg)
1022 .addReg(SaveExecReg);
1023
1024 // Set the insert point after the original instruction, so any new
1025 // instructions will be in the remainder.
1026 B.setInsertPt(*RemainderBB, RemainderBB->begin());
1027
1028 return true;
1029 }
1030
1031 // Return any unique registers used by \p MI at \p OpIndices that need to be
1032 // handled in a waterfall loop. Returns these registers in \p
1033 // SGPROperandRegs. Returns true if there are any operands to handle and a
1034 // waterfall loop is necessary.
1035 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1036 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1037 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1038 for (unsigned Op : OpIndices) {
1039 assert(MI.getOperand(Op).isUse());
1040 Register Reg = MI.getOperand(Op).getReg();
1041 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1042 if (OpBank->getID() == AMDGPU::VGPRRegBankID)
1043 SGPROperandRegs.insert(Reg);
1044 }
1045
1046 // No operands need to be replaced, so no need to loop.
1047 return !SGPROperandRegs.empty();
1048 }
1049
1050 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1051 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1052 ArrayRef<unsigned> OpIndices) const {
1053 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1054 // are the same register.
1055 SmallSet<Register, 4> SGPROperandRegs;
1056
1057 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1058 return false;
1059
1060 MachineBasicBlock::iterator I = MI.getIterator();
1061 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1062 SGPROperandRegs, MRI);
1063 }
1064
1065 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1066 MachineInstr &MI, MachineRegisterInfo &MRI,
1067 ArrayRef<unsigned> OpIndices) const {
1068 MachineIRBuilder B(MI);
1069 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1070 }
1071
1072 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1073 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1074 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1075 Register Reg = MI.getOperand(OpIdx).getReg();
1076 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1077 if (Bank != &AMDGPU::VGPRRegBank)
1078 return;
1079
1080 MachineIRBuilder B(MI);
1081 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1082 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1083 .addDef(SGPR)
1084 .addReg(Reg);
1085
1086 MRI.setType(SGPR, MRI.getType(Reg));
1087
1088 const TargetRegisterClass *Constrained =
1089 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1090 (void)Constrained;
1091 assert(Constrained && "Failed to constrain readfirstlane src reg");
1092
1093 MI.getOperand(OpIdx).setReg(SGPR);
1094 }
1095
1096 // When regbankselect repairs registers, it will insert a repair instruction
1097 // which defines the repaired register. Then it calls applyMapping and expects
1098 // that the targets will either delete or rewrite the original instruction that
1099 // wrote to the repaired registers. Because of this, we end up in a situation where
1100 // we have 2 instructions defining the same registers.
1101 static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
1102 Register Reg,
1103 const MachineInstr &MI) {
1104 // Is there some way we can assert that there are exactly 2 def instructions?
1105 for (MachineInstr &Other : MRI.def_instructions(Reg)) {
1106 if (&Other != &MI)
1107 return &Other;
1108 }
1109
1110 return nullptr;
1111 }
1112
1113 bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
1114 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1115 MachineRegisterInfo &MRI) const {
1116 Register DstReg = MI.getOperand(0).getReg();
1117 const LLT LoadTy = MRI.getType(DstReg);
1118 unsigned LoadSize = LoadTy.getSizeInBits();
1119 const unsigned MaxNonSmrdLoadSize = 128;
1120 // 128-bit loads are supported for all instruction types.
1121 if (LoadSize <= MaxNonSmrdLoadSize)
1122 return false;
1123
1124 SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
1125 SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
1126
1127 // If the pointer is an SGPR, we have nothing to do.
1128 if (SrcRegs.empty()) {
1129 const RegisterBank *PtrBank =
1130 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1131 if (PtrBank == &AMDGPU::SGPRRegBank)
1132 return false;
1133 SrcRegs.push_back(MI.getOperand(1).getReg());
1134 }
1135
1136 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1137
1138 // We want to get the repair instruction now, because it will help us
1139 // determine which instruction the legalizer inserts that will also
1140 // write to DstReg.
1141 MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
1142
1143 // RegBankSelect only emits scalar types, so we need to reset the pointer
1144 // operand to a pointer type.
1145 Register BasePtrReg = SrcRegs[0];
1146 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1147 MRI.setType(BasePtrReg, PtrTy);
1148
1149 MachineIRBuilder B(MI);
1150
1151 unsigned SplitElts =
1152 MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
1153 const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
1154 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
1155 GISelObserverWrapper Observer(&O);
1156 B.setChangeObserver(Observer);
1157 LegalizerHelper Helper(B.getMF(), Observer, B);
1158 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1159 return false;
1160
1161 // At this point, the legalizer has split the original load into smaller
1162 // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
1163 // that combines the outputs of the lower loads and writes it to DstReg.
1164 // The register bank selector has also added the RepairInst which writes to
1165 // DstReg as well.
1166
1167 MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
1168
1169 // Replace the output of the LegalizedInst with a temporary register, since
1170 // RepairInst already defines DstReg.
1171 Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
1172 LegalizedInst->getOperand(0).setReg(TmpReg);
1173 B.setInsertPt(*RepairInst->getParent(), RepairInst);
1174
1175 for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
1176 Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
1177 B.buildConstant(IdxReg, DefIdx);
1178 MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
1179 B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
1180 }
1181
1182 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1183 return true;
1184 }
1185
1186 bool AMDGPURegisterBankInfo::applyMappingImage(
1187 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1188 MachineRegisterInfo &MRI, int RsrcIdx) const {
1189 const int NumDefs = MI.getNumExplicitDefs();
1190
1191 // The reported argument index is relative to the IR intrinsic call arguments,
1192 // so we need to shift by the number of defs and the intrinsic ID.
1193 RsrcIdx += NumDefs + 1;
1194
1195 // Insert copies to VGPR arguments.
1196 applyDefaultMapping(OpdMapper);
1197
1198 // Fixup any SGPR arguments.
1199 SmallVector<unsigned, 4> SGPRIndexes;
1200 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1201 if (!MI.getOperand(I).isReg())
1202 continue;
1203
1204 // If this intrinsic has a sampler, it immediately follows rsrc.
1205 if (I == RsrcIdx || I == RsrcIdx + 1)
1206 SGPRIndexes.push_back(I);
1207 }
1208
1209 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1210 return true;
1211 }
1212
1213 // FIXME: Duplicated from LegalizerHelper
1214 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
1215 switch (Opc) {
1216 case TargetOpcode::G_SMIN:
1217 return CmpInst::ICMP_SLT;
1218 case TargetOpcode::G_SMAX:
1219 return CmpInst::ICMP_SGT;
1220 case TargetOpcode::G_UMIN:
1221 return CmpInst::ICMP_ULT;
1222 case TargetOpcode::G_UMAX:
1223 return CmpInst::ICMP_UGT;
1224 default:
1225 llvm_unreachable("not in integer min/max");
1226 }
1227 }
1228
1229 // FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
1230 void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
1231 MachineInstr &MI) const {
1232 Register Dst = MI.getOperand(0).getReg();
1233 Register Src0 = MI.getOperand(1).getReg();
1234 Register Src1 = MI.getOperand(2).getReg();
1235
1236 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
1237 LLT CmpType = LLT::scalar(32);
1238
1239 auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
1240 B.buildSelect(Dst, Cmp, Src0, Src1);
1241
1242 B.getMRI()->setRegBank(Cmp.getReg(0), AMDGPU::SGPRRegBank);
1243 MI.eraseFromParent();
1244 }
1245
1246 // For cases where only a single copy is inserted for matching register banks,
1247 // replace the register in the instruction operand.
1248 static void substituteSimpleCopyRegs(
1249 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1250 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1251 if (!SrcReg.empty()) {
1252 assert(SrcReg.size() == 1);
1253 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1254 }
1255 }
1256
1257 /// Handle register layout difference for f16 images for some subtargets.
1258 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1259 MachineRegisterInfo &MRI,
1260 Register Reg) const {
1261 if (!Subtarget.hasUnpackedD16VMem())
1262 return Reg;
1263
1264 const LLT S16 = LLT::scalar(16);
1265 LLT StoreVT = MRI.getType(Reg);
1266 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1267 return Reg;
1268
1269 auto Unmerge = B.buildUnmerge(S16, Reg);
1270
1271
1272 SmallVector<Register, 4> WideRegs;
1273 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1274 WideRegs.push_back(Unmerge.getReg(I));
1275
1276 const LLT S32 = LLT::scalar(32);
1277 int NumElts = StoreVT.getNumElements();
1278
1279 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1280 }
1281
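// Match either a plain constant or a (base + constant) add, so the constant
// part can later be folded into a buffer instruction's immediate offset field.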
1282 static std::pair<Register, unsigned>
1283 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1284 int64_t Const;
1285 if (mi_match(Reg, MRI, m_ICst(Const)))
1286 return std::make_pair(Register(), Const);
1287
1288 Register Base;
1289 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1290 return std::make_pair(Base, Const);
1291
1292 // TODO: Handle G_OR used for add case
1293 return std::make_pair(Reg, 0);
1294 }
1295
1296 std::pair<Register, unsigned>
1297 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1298 Register OrigOffset) const {
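  // The MUBUF immediate offset field is 12 bits, so 4095 is the largest offset
  // that can be folded directly into the instruction.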
1299 const unsigned MaxImm = 4095;
1300 Register BaseReg;
1301 unsigned ImmOffset;
1302 const LLT S32 = LLT::scalar(32);
1303
1304 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1305 OrigOffset);
1306
1307 unsigned C1 = 0;
1308 if (ImmOffset != 0) {
1309 // If the immediate value is too big for the immoffset field, put the value
1310 // and -4096 into the immoffset field so that the value that is copied/added
1311 // for the voffset field is a multiple of 4096, and it stands a better chance
1312 // of being CSEd with the copy/add for another similar load/store.
1313 // However, do not do that rounding down to a multiple of 4096 if that is a
1314 // negative number, as it appears to be illegal to have a negative offset
1315 // in the vgpr, even if adding the immediate offset makes it positive.
1316 unsigned Overflow = ImmOffset & ~MaxImm;
1317 ImmOffset -= Overflow;
1318 if ((int32_t)Overflow < 0) {
1319 Overflow += ImmOffset;
1320 ImmOffset = 0;
1321 }
1322
1323 C1 = ImmOffset;
1324 if (Overflow != 0) {
1325 if (!BaseReg)
1326 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1327 else {
1328 auto OverflowVal = B.buildConstant(S32, Overflow);
1329 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1330 }
1331 }
1332 }
1333
1334 if (!BaseReg)
1335 BaseReg = B.buildConstant(S32, 0).getReg(0);
1336
1337 return {BaseReg, C1};
1338 }
1339
1340 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1341 int64_t C;
1342 return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1343 }
1344
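// The cachepolicy immediate packs the cache control bits: bit 0 is glc,
// bit 1 is slc, and bit 2 is dlc (gfx10+). These helpers extract the
// individual fields for the buffer instruction operands.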
1345 static unsigned extractGLC(unsigned CachePolicy) {
1346 return CachePolicy & 1;
1347 }
1348
1349 static unsigned extractSLC(unsigned CachePolicy) {
1350 return (CachePolicy >> 1) & 1;
1351 }
1352
1353 static unsigned extractDLC(unsigned CachePolicy) {
1354 return (CachePolicy >> 2) & 1;
1355 }
1356
1357 MachineInstr *
1358 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1359 MachineInstr &MI) const {
1360 MachineRegisterInfo &MRI = *B.getMRI();
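  // The rsrc (operand 2) and soffset (operand 4) inputs must be SGPRs, so run
  // a waterfall loop if either was assigned to a VGPR.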
1361 executeInWaterfallLoop(B, MI, MRI, {2, 4});
1362
1363 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1364
1365 Register VData = MI.getOperand(1).getReg();
1366 LLT Ty = MRI.getType(VData);
1367
1368 int EltSize = Ty.getScalarSizeInBits();
1369 int Size = Ty.getSizeInBits();
1370
1371 // FIXME: Broken integer truncstore.
1372 if (EltSize != 32)
1373 report_fatal_error("unhandled intrinsic store");
1374
1375 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1376 const int MemSize = (*MI.memoperands_begin())->getSize();
1377
1378
1379 Register RSrc = MI.getOperand(2).getReg();
1380 Register VOffset = MI.getOperand(3).getReg();
1381 Register SOffset = MI.getOperand(4).getReg();
1382 unsigned CachePolicy = MI.getOperand(5).getImm();
1383
1384 unsigned ImmOffset;
1385 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1386
1387 const bool Offen = !isZero(VOffset, MRI);
1388
1389 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1390 switch (8 * MemSize) {
1391 case 8:
1392 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1393 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1394 break;
1395 case 16:
1396 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1397 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1398 break;
1399 default:
1400 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1401 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1402 if (Size > 32)
1403 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1404 break;
1405 }
1406
1407
1408 // Set the insertion point back to the instruction in case it was moved into a
1409 // loop.
1410 B.setInstr(MI);
1411
1412 MachineInstrBuilder MIB = B.buildInstr(Opc)
1413 .addUse(VData);
1414
1415 if (Offen)
1416 MIB.addUse(VOffset);
1417
1418 MIB.addUse(RSrc)
1419 .addUse(SOffset)
1420 .addImm(ImmOffset)
1421 .addImm(extractGLC(CachePolicy))
1422 .addImm(extractSLC(CachePolicy))
1423 .addImm(0) // tfe: FIXME: Remove from inst
1424 .addImm(extractDLC(CachePolicy))
1425 .cloneMemRefs(MI);
1426
1427 // FIXME: We need a way to report failure from applyMappingImpl.
1428 // Insert constrain copies before inserting the loop.
1429 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1430 report_fatal_error("failed to constrain selected store intrinsic");
1431
1432 return MIB;
1433 }
1434
1435 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1436 Register SrcReg) const {
1437 MachineRegisterInfo &MRI = *B.getMRI();
1438 LLT SrcTy = MRI.getType(SrcReg);
1439 if (SrcTy.getSizeInBits() == 32) {
1440 // Use a v_mov_b32 here to make the exec dependency explicit.
1441 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1442 .addDef(DstReg)
1443 .addUse(SrcReg);
1444 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1445 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1446 }
1447
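  // There is no single instruction to copy a 64-bit SGPR to a VGPR, so move
  // each 32-bit half with v_mov_b32 and recombine them with a REG_SEQUENCE.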
1448 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1449 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1450
1451 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1452 .addDef(TmpReg0)
1453 .addUse(SrcReg, 0, AMDGPU::sub0);
1454 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1455 .addDef(TmpReg1)
1456 .addUse(SrcReg, 0, AMDGPU::sub1);
1457 B.buildInstr(AMDGPU::REG_SEQUENCE)
1458 .addDef(DstReg)
1459 .addUse(TmpReg0)
1460 .addImm(AMDGPU::sub0)
1461 .addUse(TmpReg1)
1462 .addImm(AMDGPU::sub1);
1463
1464 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1465 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1466 }
1467
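// Apply the chosen register bank mapping to MI, rewriting the instruction in
// the cases where the default per-operand copy insertion would be wrong:
// s1 booleans, 64-bit operations that must be split for the VALU, and
// operands that must be made uniform via readfirstlane or a waterfall loop.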
1468 void AMDGPURegisterBankInfo::applyMappingImpl(
1469 const OperandsMapper &OpdMapper) const {
1470 MachineInstr &MI = OpdMapper.getMI();
1471 unsigned Opc = MI.getOpcode();
1472 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1473 switch (Opc) {
1474 case AMDGPU::G_PHI: {
1475 Register DstReg = MI.getOperand(0).getReg();
1476 LLT DstTy = MRI.getType(DstReg);
1477 if (DstTy != LLT::scalar(1))
1478 break;
1479
1480 const LLT S32 = LLT::scalar(32);
1481 const RegisterBank *DstBank =
1482 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1483 if (DstBank == &AMDGPU::VCCRegBank) {
1484 applyDefaultMapping(OpdMapper);
1485 // The standard handling only considers the result register bank for
1486 // phis. For VCC, blindly inserting a copy when the phi is lowered will
1487 // produce an invalid copy. We can only copy with some kind of compare to
1488 // get a vector boolean result. Insert a register bank copy that will be
1489 // correctly lowered to a compare.
1490 MachineIRBuilder B(*MI.getParent()->getParent());
1491
1492 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1493 Register SrcReg = MI.getOperand(I).getReg();
1494 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1495
1496 if (SrcBank != &AMDGPU::VCCRegBank) {
1497 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
1498 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
1499
1500 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
1501 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
1502 MI.getOperand(I).setReg(Copy.getReg(0));
1503 }
1504 }
1505
1506 return;
1507 }
1508
1509 // Phi handling is strange and only considers the bank of the destination.
1510 substituteSimpleCopyRegs(OpdMapper, 0);
1511
1512 // Promote SGPR/VGPR booleans to s32
1513 MachineFunction *MF = MI.getParent()->getParent();
1514 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1515 GISelObserverWrapper Observer(&ApplyBank);
1516 MachineIRBuilder B(MI);
1517 LegalizerHelper Helper(*MF, Observer, B);
1518
1519 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1520 llvm_unreachable("widen scalar should have succeeded");
1521
1522 return;
1523 }
1524 case AMDGPU::G_ICMP:
1525 case AMDGPU::G_UADDO:
1526 case AMDGPU::G_USUBO:
1527 case AMDGPU::G_UADDE:
1528 case AMDGPU::G_SADDE:
1529 case AMDGPU::G_USUBE:
1530 case AMDGPU::G_SSUBE: {
1531 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
1532 Register DstReg = MI.getOperand(BoolDstOp).getReg();
1533
1534 const RegisterBank *DstBank =
1535 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1536 if (DstBank != &AMDGPU::SGPRRegBank)
1537 break;
1538
1539 const bool HasCarryIn = MI.getNumOperands() == 5;
1540
1541 // If this is a scalar compare, promote the result to s32, as the selection
1542 // will end up using a copy to a 32-bit vreg.
1543 const LLT S32 = LLT::scalar(32);
1544 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
1545 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
1546 MI.getOperand(BoolDstOp).setReg(NewDstReg);
1547 MachineIRBuilder B(MI);
1548
1549 if (HasCarryIn) {
1550 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
1551 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
1552 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
1553 MI.getOperand(4).setReg(NewSrcReg);
1554 }
1555
1556 MachineBasicBlock *MBB = MI.getParent();
1557 B.setInsertPt(*MBB, std::next(MI.getIterator()));
1558 B.buildTrunc(DstReg, NewDstReg);
1559 return;
1560 }
1561 case AMDGPU::G_SELECT: {
1562 Register DstReg = MI.getOperand(0).getReg();
1563 LLT DstTy = MRI.getType(DstReg);
1564
1565 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
1566 if (CondRegs.empty())
1567 CondRegs.push_back(MI.getOperand(1).getReg());
1568 else {
1569 assert(CondRegs.size() == 1);
1570 }
1571
1572 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
1573 if (CondBank == &AMDGPU::SGPRRegBank) {
1574 MachineIRBuilder B(MI);
1575 const LLT S32 = LLT::scalar(32);
1576 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
1577 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
1578
1579 MI.getOperand(1).setReg(NewCondReg);
1580 B.buildZExt(NewCondReg, CondRegs[0]);
1581 }
1582
1583 if (DstTy.getSizeInBits() != 64)
1584 break;
1585
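// A 64-bit select with VGPR operands is decomposed into two 32-bit selects
// over the low and high halves, reusing the same condition register for both.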
1586 MachineIRBuilder B(MI);
1587 LLT HalfTy = getHalfSizedType(DstTy);
1588
1589 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1590 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1591 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
1592
1593 // All inputs are SGPRs, nothing special to do.
1594 if (DefRegs.empty()) {
1595 assert(Src1Regs.empty() && Src2Regs.empty());
1596 break;
1597 }
1598
1599 if (Src1Regs.empty())
1600 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1601 else {
1602 setRegsToType(MRI, Src1Regs, HalfTy);
1603 }
1604
1605 if (Src2Regs.empty())
1606 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
1607 else
1608 setRegsToType(MRI, Src2Regs, HalfTy);
1609
1610 setRegsToType(MRI, DefRegs, HalfTy);
1611
1612 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
1613 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
1614
1615 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1616 MI.eraseFromParent();
1617 return;
1618 }
1619 case AMDGPU::G_BRCOND: {
1620 Register CondReg = MI.getOperand(0).getReg();
1621 // FIXME: Should use legalizer helper, but should change bool ext type.
1622 const RegisterBank *CondBank =
1623 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1624
1625 if (CondBank == &AMDGPU::SGPRRegBank) {
1626 MachineIRBuilder B(MI);
1627 const LLT S32 = LLT::scalar(32);
1628 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
1629 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
1630
1631 MI.getOperand(0).setReg(NewCondReg);
1632 B.buildZExt(NewCondReg, CondReg);
1633 return;
1634 }
1635
1636 break;
1637 }
1638 case AMDGPU::G_AND:
1639 case AMDGPU::G_OR:
1640 case AMDGPU::G_XOR: {
1641 // A 64-bit AND/OR/XOR is only available on the SALU, so split into two
1642 // 32-bit ops if there is a VGPR input.
1643 Register DstReg = MI.getOperand(0).getReg();
1644 LLT DstTy = MRI.getType(DstReg);
1645
1646 if (DstTy.getSizeInBits() == 1) {
1647 const RegisterBank *DstBank =
1648 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1649 if (DstBank == &AMDGPU::VCCRegBank)
1650 break;
1651
1652 MachineFunction *MF = MI.getParent()->getParent();
1653 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1654 GISelObserverWrapper Observer(&ApplyBank);
1655 MachineIRBuilder B(MI);
1656 LegalizerHelper Helper(*MF, Observer, B);
1657
1658 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1659 LegalizerHelper::Legalized)
1660 llvm_unreachable("widen scalar should have succeeded");
1661 return;
1662 }
1663
1664 if (DstTy.getSizeInBits() != 64)
1665 break;
1666
1667 LLT HalfTy = getHalfSizedType(DstTy);
1668 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1669 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
1670 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1671
1672 // All inputs are SGPRs, nothing special to do.
1673 if (DefRegs.empty()) {
1674 assert(Src0Regs.empty() && Src1Regs.empty());
1675 break;
1676 }
1677
1678 assert(DefRegs.size() == 2);
1679 assert(Src0Regs.size() == Src1Regs.size() &&
1680 (Src0Regs.empty() || Src0Regs.size() == 2));
1681
1682 // Depending on where the source registers came from, the generic code may
1683 // have decided to split the inputs already or not. If not, we still need to
1684 // extract the values.
1685 MachineIRBuilder B(MI);
1686
1687 if (Src0Regs.empty())
1688 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
1689 else
1690 setRegsToType(MRI, Src0Regs, HalfTy);
1691
1692 if (Src1Regs.empty())
1693 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1694 else
1695 setRegsToType(MRI, Src1Regs, HalfTy);
1696
1697 setRegsToType(MRI, DefRegs, HalfTy);
1698
1699 B.buildInstr(Opc)
1700 .addDef(DefRegs[0])
1701 .addUse(Src0Regs[0])
1702 .addUse(Src1Regs[0]);
1703
1704 B.buildInstr(Opc)
1705 .addDef(DefRegs[1])
1706 .addUse(Src0Regs[1])
1707 .addUse(Src1Regs[1]);
1708
1709 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1710 MI.eraseFromParent();
1711 return;
1712 }
1713 case AMDGPU::G_ADD:
1714 case AMDGPU::G_SUB:
1715 case AMDGPU::G_MUL: {
1716 Register DstReg = MI.getOperand(0).getReg();
1717 LLT DstTy = MRI.getType(DstReg);
1718 if (DstTy != LLT::scalar(16))
1719 break;
1720
1721 const RegisterBank *DstBank =
1722 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1723 if (DstBank == &AMDGPU::VGPRRegBank)
1724 break;
1725
1726 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
1727 MachineFunction *MF = MI.getParent()->getParent();
1728 MachineIRBuilder B(MI);
1729 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
1730 GISelObserverWrapper Observer(&ApplySALU);
1731 LegalizerHelper Helper(*MF, Observer, B);
1732
1733 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1734 LegalizerHelper::Legalized)
1735 llvm_unreachable("widen scalar should have succeeded");
1736 return;
1737 }
1738 case AMDGPU::G_SMIN:
1739 case AMDGPU::G_SMAX:
1740 case AMDGPU::G_UMIN:
1741 case AMDGPU::G_UMAX: {
1742 Register DstReg = MI.getOperand(0).getReg();
1743 const RegisterBank *DstBank =
1744 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1745 if (DstBank == &AMDGPU::VGPRRegBank)
1746 break;
1747
1748 MachineFunction *MF = MI.getParent()->getParent();
1749 MachineIRBuilder B(MI);
1750
1751 // Turn scalar min/max into a compare and select.
1752 LLT Ty = MRI.getType(DstReg);
1753 LLT S32 = LLT::scalar(32);
1754 LLT S16 = LLT::scalar(16);
1755
1756 if (Ty == S16) {
1757 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
1758 GISelObserverWrapper Observer(&ApplySALU);
1759 LegalizerHelper Helper(*MF, Observer, B);
1760
1761 // Need to widen to s32, and expand as cmp + select.
1762 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1763 llvm_unreachable("widenScalar should have succeeded");
1764
1765 // FIXME: This is relying on widenScalar leaving MI in place.
1766 lowerScalarMinMax(B, MI);
1767 } else
1768 lowerScalarMinMax(B, MI);
1769
1770 return;
1771 }
1772 case AMDGPU::G_SEXT:
1773 case AMDGPU::G_ZEXT: {
1774 Register SrcReg = MI.getOperand(1).getReg();
1775 LLT SrcTy = MRI.getType(SrcReg);
1776 bool Signed = Opc == AMDGPU::G_SEXT;
1777
1778 MachineIRBuilder B(MI);
1779 const RegisterBank *SrcBank =
1780 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1781
1782 Register DstReg = MI.getOperand(0).getReg();
1783 LLT DstTy = MRI.getType(DstReg);
1784 if (DstTy.isScalar() &&
1785 SrcBank != &AMDGPU::SGPRRegBank &&
1786 SrcBank != &AMDGPU::VCCRegBank &&
1787 // FIXME: Should handle any type that rounds to s64 when irregular
1788 // breakdowns are supported.
1789 DstTy.getSizeInBits() == 64 &&
1790 SrcTy.getSizeInBits() <= 32) {
1791 const LLT S32 = LLT::scalar(32);
1792 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1793
1794 // Extend to 32-bit, and then extend the low half.
1795 if (Signed) {
1796 // TODO: Should really be buildSExtOrCopy
1797 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
1798
1799 // Replicate sign bit from 32-bit extended part.
1800 auto ShiftAmt = B.buildConstant(S32, 31);
1801 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1802 B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
1803 } else {
1804 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
1805 B.buildConstant(DefRegs[1], 0);
1806 }
1807
1808 MRI.setRegBank(DstReg, *SrcBank);
1809 MI.eraseFromParent();
1810 return;
1811 }
1812
1813 if (SrcTy != LLT::scalar(1))
1814 return;
1815
1816 if (SrcBank == &AMDGPU::VCCRegBank) {
1817 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1818
1819 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
1820
1821 unsigned DstSize = DstTy.getSizeInBits();
1822 // 64-bit select is SGPR only
1823 const bool UseSel64 = DstSize > 32 &&
1824 SrcBank->getID() == AMDGPU::SGPRRegBankID;
1825
1826 // TODO: Should s16 select be legal?
1827 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
1828 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
1829 auto False = B.buildConstant(SelType, 0);
1830
1831 MRI.setRegBank(True.getReg(0), *DstBank);
1832 MRI.setRegBank(False.getReg(0), *DstBank);
1833 MRI.setRegBank(DstReg, *DstBank);
1834
1835 if (DstSize > 32) {
1836 B.buildSelect(DefRegs[0], SrcReg, True, False);
1837 B.buildCopy(DefRegs[1], DefRegs[0]);
1838 } else if (DstSize < 32) {
1839 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
1840 MRI.setRegBank(Sel.getReg(0), *DstBank);
1841 B.buildTrunc(DstReg, Sel);
1842 } else {
1843 B.buildSelect(DstReg, SrcReg, True, False);
1844 }
1845
1846 MI.eraseFromParent();
1847 return;
1848 }
1849
1850 // Fixup the case with an s1 src that isn't a condition register. Use shifts
1851 // instead of introducing a compare to avoid an unnecessary condition
1852 // register (and since there are no scalar 16-bit compares).
1853 auto Ext = B.buildAnyExt(DstTy, SrcReg);
1854 auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
1855 auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
1856
1857 if (MI.getOpcode() == AMDGPU::G_SEXT)
1858 B.buildAShr(DstReg, Shl, ShiftAmt);
1859 else
1860 B.buildLShr(DstReg, Shl, ShiftAmt);
1861
1862 MRI.setRegBank(DstReg, *SrcBank);
1863 MRI.setRegBank(Ext.getReg(0), *SrcBank);
1864 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1865 MRI.setRegBank(Shl.getReg(0), *SrcBank);
1866 MI.eraseFromParent();
1867 return;
1868 }
1869 case AMDGPU::G_BUILD_VECTOR:
1870 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
1871 Register DstReg = MI.getOperand(0).getReg();
1872 LLT DstTy = MRI.getType(DstReg);
1873 if (DstTy != LLT::vector(2, 16))
1874 break;
1875
1876 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
1877 substituteSimpleCopyRegs(OpdMapper, 1);
1878 substituteSimpleCopyRegs(OpdMapper, 2);
1879
1880 const RegisterBank *DstBank =
1881 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1882 if (DstBank == &AMDGPU::SGPRRegBank)
1883 break; // Can use S_PACK_* instructions.
1884
1885 MachineIRBuilder B(MI);
1886
1887 Register Lo = MI.getOperand(1).getReg();
1888 Register Hi = MI.getOperand(2).getReg();
1889 const LLT S32 = LLT::scalar(32);
1890
1891 const RegisterBank *BankLo =
1892 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1893 const RegisterBank *BankHi =
1894 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1895
1896 Register ZextLo;
1897 Register ShiftHi;
1898
1899 if (Opc == AMDGPU::G_BUILD_VECTOR) {
1900 ZextLo = B.buildZExt(S32, Lo).getReg(0);
1901 MRI.setRegBank(ZextLo, *BankLo);
1902
1903 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
1904 MRI.setRegBank(ZextHi, *BankHi);
1905
1906 auto ShiftAmt = B.buildConstant(S32, 16);
1907 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1908
1909 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
1910 MRI.setRegBank(ShiftHi, *BankHi);
1911 } else {
1912 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
1913 MRI.setRegBank(MaskLo, *BankLo);
1914
1915 auto ShiftAmt = B.buildConstant(S32, 16);
1916 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
1917
1918 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
1919 MRI.setRegBank(ShiftHi, *BankHi);
1920
1921 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
1922 MRI.setRegBank(ZextLo, *BankLo);
1923 }
1924
1925 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
1926 MRI.setRegBank(Or.getReg(0), *DstBank);
1927
1928 B.buildBitcast(DstReg, Or);
1929 MI.eraseFromParent();
1930 return;
1931 }
1932 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
1933 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1934
1935 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
1936
1937 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1938 MachineIRBuilder B(MI);
1939
1940 const ValueMapping &DstMapping
1941 = OpdMapper.getInstrMapping().getOperandMapping(0);
1942 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
1943 const RegisterBank *SrcBank =
1944 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1945
1946 Register DstReg = MI.getOperand(0).getReg();
1947 Register SrcReg = MI.getOperand(1).getReg();
1948 Register IdxReg = MI.getOperand(2).getReg();
1949
1950 // If this is a VGPR result only because the index was a VGPR result, the
1951 // actual indexing will be done on the SGPR source vector, which will
1952 // produce a scalar result. We need to copy to the VGPR result inside the
1953 // waterfall loop.
1954 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
1955 SrcBank == &AMDGPU::SGPRRegBank;
1956 if (DstRegs.empty()) {
1957 applyDefaultMapping(OpdMapper);
1958
1959 executeInWaterfallLoop(MI, MRI, { 2 });
1960
1961 if (NeedCopyToVGPR) {
1962 // We don't want a phi for this temporary reg.
1963 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
1964 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
1965 MI.getOperand(0).setReg(TmpReg);
1966 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
1967
1968 // Use a v_mov_b32 here to make the exec dependency explicit.
1969 buildVCopy(B, DstReg, TmpReg);
1970 }
1971
1972 return;
1973 }
1974
1975 assert(DstTy.getSizeInBits() == 64);
1976
1977 LLT SrcTy = MRI.getType(SrcReg);
1978 const LLT S32 = LLT::scalar(32);
1979 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
1980
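// Bitcast the vector of 64-bit elements to one with twice as many 32-bit
// elements, so each 64-bit extract becomes two 32-bit extracts.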
1981 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
1982 auto One = B.buildConstant(S32, 1);
1983
1984 // Split the vector index into 32-bit pieces. Prepare to move all of the
1985 // new instructions into a waterfall loop if necessary.
1986 //
1987 // Don't put the bitcast or constant in the loop.
1988 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
1989
1990 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
1991 auto IdxLo = B.buildShl(S32, IdxReg, One);
1992 auto IdxHi = B.buildAdd(S32, IdxLo, One);
1993
1994 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
1995 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
1996
1997 MRI.setRegBank(DstReg, *DstBank);
1998 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
1999 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2000 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2001 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2002
2003 SmallSet<Register, 4> OpsToWaterfall;
2004 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2005 MI.eraseFromParent();
2006 return;
2007 }
2008
2009 // Remove the original instruction to avoid potentially confusing the
2010 // waterfall loop logic.
2011 B.setInstr(*Span.begin());
2012 MI.eraseFromParent();
2013 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2014 OpsToWaterfall, MRI);
2015
2016 if (NeedCopyToVGPR) {
2017 MachineBasicBlock *LoopBB = Extract1->getParent();
2018 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2019 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2020 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2021 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2022
2023 Extract0->getOperand(0).setReg(TmpReg0);
2024 Extract1->getOperand(0).setReg(TmpReg1);
2025
2026 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2027
2028 buildVCopy(B, DstRegs[0], TmpReg0);
2029 buildVCopy(B, DstRegs[1], TmpReg1);
2030 }
2031
2032 return;
2033 }
2034 case AMDGPU::G_INSERT_VECTOR_ELT: {
2035 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2036
2037 assert(OpdMapper.getVRegs(0).empty());
2038 assert(OpdMapper.getVRegs(1).empty());
2039 assert(OpdMapper.getVRegs(3).empty());
2040
2041 if (InsRegs.empty()) {
2042 applyDefaultMapping(OpdMapper);
2043 executeInWaterfallLoop(MI, MRI, { 3 });
2044 return;
2045 }
2046
2047 Register DstReg = MI.getOperand(0).getReg();
2048 Register SrcReg = MI.getOperand(1).getReg();
2049 Register InsReg = MI.getOperand(2).getReg();
2050 Register IdxReg = MI.getOperand(3).getReg();
2051 LLT SrcTy = MRI.getType(SrcReg);
2052 LLT InsTy = MRI.getType(InsReg);
2053 (void)InsTy;
2054
2055 assert(InsTy.getSizeInBits() == 64);
2056
2057 const LLT S32 = LLT::scalar(32);
2058 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
2059
2060 MachineIRBuilder B(MI);
2061 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2062 auto One = B.buildConstant(S32, 1);
2063
2064 // Split the vector index into 32-bit pieces. Prepare to move all of the
2065 // new instructions into a waterfall loop if necessary.
2066 //
2067 // Don't put the bitcast or constant in the loop.
2068 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2069
2070 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2071 auto IdxLo = B.buildShl(S32, IdxReg, One);
2072 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2073
2074 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2075 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2076 B.buildBitcast(DstReg, InsHi);
2077
2078 const RegisterBank *DstBank =
2079 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2080 const RegisterBank *SrcBank =
2081 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2082 const RegisterBank *InsSrcBank =
2083 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2084
2085 MRI.setRegBank(InsReg, *InsSrcBank);
2086 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2087 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2088 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2089 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2090 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2091 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2092
2093
2094 SmallSet<Register, 4> OpsToWaterfall;
2095 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2096 MI.eraseFromParent();
2097 return;
2098 }
2099
2100 B.setInstr(*Span.begin());
2101 MI.eraseFromParent();
2102
2103 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2104 OpsToWaterfall, MRI);
2105 return;
2106 }
2107 case AMDGPU::G_INTRINSIC: {
2108 switch (MI.getIntrinsicID()) {
2109 case Intrinsic::amdgcn_s_buffer_load: {
2110 // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
2111 executeInWaterfallLoop(MI, MRI, { 2, 3 });
2112 return;
2113 }
2114 case Intrinsic::amdgcn_readlane: {
2115 substituteSimpleCopyRegs(OpdMapper, 2);
2116
2117 assert(OpdMapper.getVRegs(0).empty());
2118 assert(OpdMapper.getVRegs(3).empty());
2119
2120 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2121 // waterfall loop, so assume it's a uniform value.
2122 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2123 return;
2124 }
2125 case Intrinsic::amdgcn_writelane: {
2126 assert(OpdMapper.getVRegs(0).empty());
2127 assert(OpdMapper.getVRegs(2).empty());
2128 assert(OpdMapper.getVRegs(3).empty());
2129
2130 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2131 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2132 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2133 return;
2134 }
2135 default:
2136 break;
2137 }
2138 break;
2139 }
2140 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2141 auto IntrID = MI.getIntrinsicID();
2142 switch (IntrID) {
2143 case Intrinsic::amdgcn_buffer_load: {
2144 executeInWaterfallLoop(MI, MRI, { 2 });
2145 return;
2146 }
2147 case Intrinsic::amdgcn_ds_ordered_add:
2148 case Intrinsic::amdgcn_ds_ordered_swap: {
2149 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2150 assert(OpdMapper.getVRegs(0).empty());
2151 substituteSimpleCopyRegs(OpdMapper, 3);
2152 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2153 return;
2154 }
2155 case Intrinsic::amdgcn_ds_gws_init:
2156 case Intrinsic::amdgcn_ds_gws_barrier:
2157 case Intrinsic::amdgcn_ds_gws_sema_br: {
2158 // Only the first lane executes, so readfirstlane is safe.
2159 substituteSimpleCopyRegs(OpdMapper, 1);
2160 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2161 return;
2162 }
2163 case Intrinsic::amdgcn_ds_gws_sema_v:
2164 case Intrinsic::amdgcn_ds_gws_sema_p:
2165 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2166 // Only the first lane executes, so readfirstlane is safe.
2167 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
2168 return;
2169 }
2170 case Intrinsic::amdgcn_s_sendmsg:
2171 case Intrinsic::amdgcn_s_sendmsghalt: {
2172 // FIXME: Should this use a waterfall loop?
2173 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2174 return;
2175 }
2176 case Intrinsic::amdgcn_raw_buffer_load:
2177 case Intrinsic::amdgcn_raw_buffer_load_format:
2178 case Intrinsic::amdgcn_raw_tbuffer_load:
2179 case Intrinsic::amdgcn_raw_buffer_store:
2180 case Intrinsic::amdgcn_raw_buffer_store_format:
2181 case Intrinsic::amdgcn_raw_tbuffer_store: {
2182 applyDefaultMapping(OpdMapper);
2183 executeInWaterfallLoop(MI, MRI, {2, 4});
2184 return;
2185 }
2186 case Intrinsic::amdgcn_struct_buffer_load:
2187 case Intrinsic::amdgcn_struct_buffer_store:
2188 case Intrinsic::amdgcn_struct_tbuffer_load:
2189 case Intrinsic::amdgcn_struct_tbuffer_store: {
2190 applyDefaultMapping(OpdMapper);
2191 executeInWaterfallLoop(MI, MRI, {2, 5});
2192 return;
2193 }
2194 default: {
2195 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2196 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2197 // Non-images can have complications from operands that allow both SGPR
2198 // and VGPR. For now it's too complicated to figure out the final opcode
2199 // to derive the register bank from the MCInstrDesc.
2200 if (RSrcIntrin->IsImage) {
2201 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2202 return;
2203 }
2204 }
2205
2206 break;
2207 }
2208 }
2209 break;
2210 }
2211 case AMDGPU::G_LOAD:
2212 case AMDGPU::G_ZEXTLOAD:
2213 case AMDGPU::G_SEXTLOAD: {
2214 if (applyMappingWideLoad(MI, OpdMapper, MRI))
2215 return;
2216 break;
2217 }
2218 default:
2219 break;
2220 }
2221
2222 return applyDefaultMapping(OpdMapper);
2223 }
2224
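// Return true if every register operand of MI that already has a bank is in
// the SGPR bank; operands with no bank assigned yet are ignored.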
2225 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
2226 const MachineFunction &MF = *MI.getParent()->getParent();
2227 const MachineRegisterInfo &MRI = MF.getRegInfo();
2228 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
2229 if (!MI.getOperand(i).isReg())
2230 continue;
2231 Register Reg = MI.getOperand(i).getReg();
2232 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
2233 if (Bank->getID() != AMDGPU::SGPRRegBankID)
2234 return false;
2235 }
2236 }
2237 return true;
2238 }
2239
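// Default mapping for SALU instructions: every register operand is an SGPR of
// its natural size.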
2240 const RegisterBankInfo::InstructionMapping &
2241 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
2242 const MachineFunction &MF = *MI.getParent()->getParent();
2243 const MachineRegisterInfo &MRI = MF.getRegInfo();
2244 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2245
2246 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2247 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2248 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2249 }
2250 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2251 MI.getNumOperands());
2252 }
2253
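// Default mapping for VALU instructions: the result is a VGPR, 1-bit sources
// are mapped to the VCC bank, and all other sources are VGPRs.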
2254 const RegisterBankInfo::InstructionMapping &
2255 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
2256 const MachineFunction &MF = *MI.getParent()->getParent();
2257 const MachineRegisterInfo &MRI = MF.getRegInfo();
2258 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2259 unsigned OpdIdx = 0;
2260
2261 unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2262 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
2263
2264 if (MI.getOperand(OpdIdx).isIntrinsicID())
2265 OpdsMapping[OpdIdx++] = nullptr;
2266
2267 Register Reg1 = MI.getOperand(OpdIdx).getReg();
2268 unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
2269
2270 unsigned DefaultBankID = Size1 == 1 ?
2271 AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
2272 unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
2273
2274 OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
2275
2276 for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
2277 const MachineOperand &MO = MI.getOperand(OpdIdx);
2278 if (!MO.isReg())
2279 continue;
2280
2281 unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
2282 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
2283 OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
2284 }
2285
2286 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2287 MI.getNumOperands());
2288 }
2289
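// Map every register operand to a VGPR; used for operations with no scalar
// form, such as the DS permute and DPP intrinsics.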
2290 const RegisterBankInfo::InstructionMapping &
2291 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
2292 const MachineFunction &MF = *MI.getParent()->getParent();
2293 const MachineRegisterInfo &MRI = MF.getRegInfo();
2294 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2295
2296 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
2297 const MachineOperand &Op = MI.getOperand(I);
2298 if (!Op.isReg())
2299 continue;
2300
2301 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
2302 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2303 }
2304
2305 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2306 MI.getNumOperands());
2307 }
2308
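// Mapping for image intrinsics: the resource descriptor (and the sampler,
// which immediately follows it when present) must be SGPRs, while all other
// register operands are mapped to VGPRs.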
2309 const RegisterBankInfo::InstructionMapping &
2310 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
2311 const MachineInstr &MI,
2312 int RsrcIdx) const {
2313 // The reported argument index is relative to the IR intrinsic call arguments,
2314 // so we need to shift by the number of defs and the intrinsic ID.
2315 RsrcIdx += MI.getNumExplicitDefs() + 1;
2316
2317 const int NumOps = MI.getNumOperands();
2318 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
2319
2320 // TODO: Should packed/unpacked D16 difference be reported here as part of
2321 // the value mapping?
2322 for (int I = 0; I != NumOps; ++I) {
2323 if (!MI.getOperand(I).isReg())
2324 continue;
2325
2326 Register OpReg = MI.getOperand(I).getReg();
2327 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
2328
2329 // FIXME: Probably need a new intrinsic register bank searchable table to
2330 // handle arbitrary intrinsics easily.
2331 //
2332 // If this has a sampler, it immediately follows rsrc.
2333 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
2334
2335 if (MustBeSGPR) {
2336 // This must be an SGPR, so we must report whatever it is as legal.
2337 unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2338 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
2339 } else {
2340 // Some operands must be VGPR, and these are easy to copy to.
2341 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2342 }
2343 }
2344
2345 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
2346 }
2347
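// Mapping for generic loads: a uniform load from an address space that
// supports scalar loads is mapped entirely to SGPRs (an SMRD load); anything
// else keeps the pointer and loaded value in VGPRs.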
2348 const RegisterBankInfo::InstructionMapping &
2349 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
2350
2351 const MachineFunction &MF = *MI.getParent()->getParent();
2352 const MachineRegisterInfo &MRI = MF.getRegInfo();
2353 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
2354 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2355 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
2356 Register PtrReg = MI.getOperand(1).getReg();
2357 LLT PtrTy = MRI.getType(PtrReg);
2358 unsigned AS = PtrTy.getAddressSpace();
2359 unsigned PtrSize = PtrTy.getSizeInBits();
2360
2361 const ValueMapping *ValMapping;
2362 const ValueMapping *PtrMapping;
2363
2364 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
2365
2366 if (PtrBank == &AMDGPU::SGPRRegBank &&
2367 (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
2368 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
2369 isScalarLoadLegal(MI)) {
2370 // We have a uniform instruction so we want to use an SMRD load
2371 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2372 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
2373 } else {
2374 ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
2375 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
2376 }
2377
2378 OpdsMapping[0] = ValMapping;
2379 OpdsMapping[1] = PtrMapping;
2380 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
2381 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
2382 return Mapping;
2383
2384 // FIXME: Do we want to add a mapping for FLAT load, or should we just
2385 // handle that during instruction selection?
2386 }
2387
2388 unsigned
2389 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
2390 const MachineRegisterInfo &MRI,
2391 const TargetRegisterInfo &TRI,
2392 unsigned Default) const {
2393 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
2394 return Bank ? Bank->getID() : Default;
2395 }
2396
2397
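// Merge two register bank IDs for a value produced from both: the result is
// SGPR only if both inputs are SGPR, otherwise it must be a VGPR.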
2398 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
2399 return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
2400 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2401 }
2402
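// Bank union for boolean (s1) values, where -1 means no bank has been chosen
// yet: any VCC input forces a VCC result, otherwise fall back to regBankUnion.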
2403 static int regBankBoolUnion(int RB0, int RB1) {
2404 if (RB0 == -1)
2405 return RB1;
2406 if (RB1 == -1)
2407 return RB0;
2408
2409 // vcc, vcc -> vcc
2410 // vcc, sgpr -> vcc
2411 // vcc, vgpr -> vcc
2412 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
2413 return AMDGPU::VCCRegBankID;
2414
2415 // sgpr, vgpr -> vgpr
2416 return regBankUnion(RB0, RB1);
2417 }
2418
2419 const RegisterBankInfo::ValueMapping *
2420 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
2421 const MachineRegisterInfo &MRI,
2422 const TargetRegisterInfo &TRI) const {
2423 // Lie and claim anything is legal, even though this needs to be an SGPR.
2424 // applyMapping will have to deal with it as a waterfall loop.
2425 unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
2426 unsigned Size = getSizeInBits(Reg, MRI, TRI);
2427 return AMDGPU::getValueMapping(Bank, Size);
2428 }
2429
2430 const RegisterBankInfo::ValueMapping *
2431 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
2432 const MachineRegisterInfo &MRI,
2433 const TargetRegisterInfo &TRI) const {
2434 unsigned Size = getSizeInBits(Reg, MRI, TRI);
2435 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2436 }
2437
2438 const RegisterBankInfo::ValueMapping *
2439 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
2440 const MachineRegisterInfo &MRI,
2441 const TargetRegisterInfo &TRI) const {
2442 unsigned Size = getSizeInBits(Reg, MRI, TRI);
2443 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
2444 }
2445
2446 ///
2447 /// This function must return a legal mapping, because
2448 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
2449 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
2450 /// VGPR-to-SGPR copy to be generated is illegal.
2451 ///
2452 const RegisterBankInfo::InstructionMapping &
2453 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
2454 const MachineFunction &MF = *MI.getParent()->getParent();
2455 const MachineRegisterInfo &MRI = MF.getRegInfo();
2456
2457 if (MI.isRegSequence()) {
2458 // If any input is a VGPR, the result must be a VGPR. The default handling
2459 // assumes any copy between banks is legal.
2460 unsigned BankID = AMDGPU::SGPRRegBankID;
2461
2462 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2463 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
2464 // It doesn't make sense to use vcc or scc banks here, so just ignore
2465 // them.
2466 if (OpBank != AMDGPU::SGPRRegBankID) {
2467 BankID = AMDGPU::VGPRRegBankID;
2468 break;
2469 }
2470 }
2471 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2472
2473 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
2474 return getInstructionMapping(
2475 1, /*Cost*/ 1,
2476 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2477 }
2478
2479 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
2480 // properly.
2481 //
2482 // TODO: There are additional exec masking dependencies to analyze.
2483 if (MI.getOpcode() == TargetOpcode::G_PHI) {
2484 // TODO: Generate proper invalid bank enum.
2485 int ResultBank = -1;
2486 Register DstReg = MI.getOperand(0).getReg();
2487
2488 // Sometimes the result may have already been assigned a bank.
2489 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
2490 ResultBank = DstBank->getID();
2491
2492 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2493 Register Reg = MI.getOperand(I).getReg();
2494 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
2495
2496 // FIXME: Assuming VGPR for any undetermined inputs.
2497 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
2498 ResultBank = AMDGPU::VGPRRegBankID;
2499 break;
2500 }
2501
2502 // FIXME: Need to promote SGPR case to s32
2503 unsigned OpBank = Bank->getID();
2504 ResultBank = regBankBoolUnion(ResultBank, OpBank);
2505 }
2506
2507 assert(ResultBank != -1);
2508
2509 unsigned Size = MRI.getType(DstReg).getSizeInBits();
2510
2511 const ValueMapping &ValMap =
2512 getValueMapping(0, Size, getRegBank(ResultBank));
2513 return getInstructionMapping(
2514 1, /*Cost*/ 1,
2515 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
2516 }
2517
2518 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
2519 if (Mapping.isValid())
2520 return Mapping;
2521
2522 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2523
2524 switch (MI.getOpcode()) {
2525 default:
2526 return getInvalidInstructionMapping();
2527
2528 case AMDGPU::G_AND:
2529 case AMDGPU::G_OR:
2530 case AMDGPU::G_XOR: {
2531 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2532 if (Size == 1) {
2533 const RegisterBank *DstBank
2534 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
2535
2536 unsigned TargetBankID = -1;
2537 unsigned BankLHS = -1;
2538 unsigned BankRHS = -1;
2539 if (DstBank) {
2540 TargetBankID = DstBank->getID();
2541 if (DstBank == &AMDGPU::VCCRegBank) {
2542 TargetBankID = AMDGPU::VCCRegBankID;
2543 BankLHS = AMDGPU::VCCRegBankID;
2544 BankRHS = AMDGPU::VCCRegBankID;
2545 } else {
2546 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2547 AMDGPU::SGPRRegBankID);
2548 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2549 AMDGPU::SGPRRegBankID);
2550 }
2551 } else {
2552 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2553 AMDGPU::VCCRegBankID);
2554 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2555 AMDGPU::VCCRegBankID);
2556
2557 // Both inputs should be true booleans to produce a boolean result.
2558 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
2559 TargetBankID = AMDGPU::VGPRRegBankID;
2560 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
2561 TargetBankID = AMDGPU::VCCRegBankID;
2562 BankLHS = AMDGPU::VCCRegBankID;
2563 BankRHS = AMDGPU::VCCRegBankID;
2564 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
2565 TargetBankID = AMDGPU::SGPRRegBankID;
2566 }
2567 }
2568
2569 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
2570 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
2571 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
2572 break;
2573 }
2574
2575 if (Size == 64) {
2576
2577 if (isSALUMapping(MI)) {
2578 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
2579 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
2580 } else {
2581 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
2582 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
2583 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
2584
2585 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
2586 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
2587 }
2588
2589 break;
2590 }
2591
2592 LLVM_FALLTHROUGH;
2593 }
2594 case AMDGPU::G_PTR_ADD:
2595 case AMDGPU::G_ADD:
2596 case AMDGPU::G_SUB:
2597 case AMDGPU::G_MUL:
2598 case AMDGPU::G_SHL:
2599 case AMDGPU::G_LSHR:
2600 case AMDGPU::G_ASHR:
2601 case AMDGPU::G_UADDO:
2602 case AMDGPU::G_USUBO:
2603 case AMDGPU::G_UADDE:
2604 case AMDGPU::G_SADDE:
2605 case AMDGPU::G_USUBE:
2606 case AMDGPU::G_SSUBE:
2607 case AMDGPU::G_SMIN:
2608 case AMDGPU::G_SMAX:
2609 case AMDGPU::G_UMIN:
2610 case AMDGPU::G_UMAX:
2611 if (isSALUMapping(MI))
2612 return getDefaultMappingSOP(MI);
2613 LLVM_FALLTHROUGH;
2614
2615 case AMDGPU::G_FADD:
2616 case AMDGPU::G_FSUB:
2617 case AMDGPU::G_FPTOSI:
2618 case AMDGPU::G_FPTOUI:
2619 case AMDGPU::G_FMUL:
2620 case AMDGPU::G_FMA:
2621 case AMDGPU::G_FMAD:
2622 case AMDGPU::G_FSQRT:
2623 case AMDGPU::G_FFLOOR:
2624 case AMDGPU::G_FCEIL:
2625 case AMDGPU::G_FRINT:
2626 case AMDGPU::G_SITOFP:
2627 case AMDGPU::G_UITOFP:
2628 case AMDGPU::G_FPTRUNC:
2629 case AMDGPU::G_FPEXT:
2630 case AMDGPU::G_FEXP2:
2631 case AMDGPU::G_FLOG2:
2632 case AMDGPU::G_FMINNUM:
2633 case AMDGPU::G_FMAXNUM:
2634 case AMDGPU::G_FMINNUM_IEEE:
2635 case AMDGPU::G_FMAXNUM_IEEE:
2636 case AMDGPU::G_FCANONICALIZE:
2637 case AMDGPU::G_INTRINSIC_TRUNC:
2638 case AMDGPU::G_AMDGPU_FFBH_U32:
2639 return getDefaultMappingVOP(MI);
2640 case AMDGPU::G_UMULH:
2641 case AMDGPU::G_SMULH: {
2642 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
2643 return getDefaultMappingSOP(MI);
2644 return getDefaultMappingVOP(MI);
2645 }
2646 case AMDGPU::G_IMPLICIT_DEF: {
2647 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2648 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2649 break;
2650 }
2651 case AMDGPU::G_FCONSTANT:
2652 case AMDGPU::G_CONSTANT:
2653 case AMDGPU::G_GLOBAL_VALUE:
2654 case AMDGPU::G_BLOCK_ADDR:
2655 case AMDGPU::G_READCYCLECOUNTER: {
2656 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2657 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2658 break;
2659 }
2660 case AMDGPU::G_FRAME_INDEX: {
2661 // TODO: This should be the same as other constants, but eliminateFrameIndex
2662 // currently assumes VALU uses.
2663 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2664 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2665 break;
2666 }
2667 case AMDGPU::G_INSERT: {
2668 unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2669 AMDGPU::VGPRRegBankID;
2670 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2671 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2672 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
2673 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2674 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2675 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
2676 OpdsMapping[3] = nullptr;
2677 break;
2678 }
2679 case AMDGPU::G_EXTRACT: {
2680 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2681 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2682 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2683 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
2684 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
2685 OpdsMapping[2] = nullptr;
2686 break;
2687 }
2688 case AMDGPU::G_BUILD_VECTOR:
2689 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2690 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
2691 if (DstTy == LLT::vector(2, 16)) {
2692 unsigned DstSize = DstTy.getSizeInBits();
2693 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2694 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2695 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2696 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
2697
2698 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
2699 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
2700 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
2701 break;
2702 }
2703
2704 LLVM_FALLTHROUGH;
2705 }
2706 case AMDGPU::G_MERGE_VALUES:
2707 case AMDGPU::G_CONCAT_VECTORS: {
2708 unsigned Bank = isSALUMapping(MI) ?
2709 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2710 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2711 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2712
2713 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
2714 // Op1 and Dst should use the same register bank.
2715 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
2716 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
2717 break;
2718 }
2719 case AMDGPU::G_BITCAST:
2720 case AMDGPU::G_INTTOPTR:
2721 case AMDGPU::G_PTRTOINT:
2722 case AMDGPU::G_CTLZ:
2723 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2724 case AMDGPU::G_CTTZ:
2725 case AMDGPU::G_CTTZ_ZERO_UNDEF:
2726 case AMDGPU::G_CTPOP:
2727 case AMDGPU::G_BSWAP:
2728 case AMDGPU::G_BITREVERSE:
2729 case AMDGPU::G_FABS:
2730 case AMDGPU::G_FNEG: {
2731 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2732 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2733 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
2734 break;
2735 }
2736 case AMDGPU::G_TRUNC: {
2737 Register Dst = MI.getOperand(0).getReg();
2738 Register Src = MI.getOperand(1).getReg();
2739 unsigned Bank = getRegBankID(Src, MRI, *TRI);
2740 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2741 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2742 OpdsMapping[0] = DstSize == 1 && Bank != AMDGPU::SGPRRegBankID ?
2743 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize) :
2744 AMDGPU::getValueMapping(Bank, DstSize);
2745 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
2746 break;
2747 }
2748 case AMDGPU::G_ZEXT:
2749 case AMDGPU::G_SEXT:
2750 case AMDGPU::G_ANYEXT: {
2751 Register Dst = MI.getOperand(0).getReg();
2752 Register Src = MI.getOperand(1).getReg();
2753 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
2754 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
2755
2756 unsigned DstBank;
2757 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
2758 assert(SrcBank);
2759 switch (SrcBank->getID()) {
2760 case AMDGPU::SGPRRegBankID:
2761 DstBank = AMDGPU::SGPRRegBankID;
2762 break;
2763 default:
2764 DstBank = AMDGPU::VGPRRegBankID;
2765 break;
2766 }
2767
2768 // TODO: Should anyext be split into 32-bit parts as well?
2769 if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
2770 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
2771 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
2772 } else {
2773 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
2774 // 32-bits, and then to 64.
2775 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
2776 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
2777 SrcSize);
2778 }
2779 break;
2780 }
2781 case AMDGPU::G_FCMP: {
2782 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2783 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2784 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2785 OpdsMapping[1] = nullptr; // Predicate Operand.
2786 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2787 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2788 break;
2789 }
2790 case AMDGPU::G_STORE: {
2791 assert(MI.getOperand(0).isReg());
2792 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2793 // FIXME: We need to specify a different reg bank once scalar stores
2794 // are supported.
2795 const ValueMapping *ValMapping =
2796 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2797 // FIXME: Depending on the type of store, the pointer could be in
2798 // the SGPR Reg bank.
2799 // FIXME: Pointer size should be based on the address space.
2800 const ValueMapping *PtrMapping =
2801 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
2802
2803 OpdsMapping[0] = ValMapping;
2804 OpdsMapping[1] = PtrMapping;
2805 break;
2806 }
2807
2808 case AMDGPU::G_ICMP: {
2809 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
2810 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2811 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2812 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2813
2814 bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
2815 Op3Bank == AMDGPU::SGPRRegBankID &&
2816 (Size == 32 || (Size == 64 &&
2817 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
2818 Subtarget.hasScalarCompareEq64()));
2819
2820 unsigned Op0Bank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
2821
2822 // TODO: Use 32-bit for scalar output size.
2823 // SCC results will need to be copied to a 32-bit SGPR virtual register.
2824 const unsigned ResultSize = 1;
2825
2826 OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, ResultSize);
2827 OpdsMapping[1] = nullptr; // Predicate Operand.
2828 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
2829 OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
2830 break;
2831 }
2832 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2833 // A VGPR index can be used for a waterfall loop when indexing an SGPR vector.
2834 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2835 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2836 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2837 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2838 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2839 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
2840
2841 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
2842 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
2843
2844 // The index can be in either bank if the source vector is a VGPR.
2845 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2846 break;
2847 }
2848 case AMDGPU::G_INSERT_VECTOR_ELT: {
2849 unsigned OutputBankID = isSALUMapping(MI) ?
2850 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2851
2852 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2853 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2854 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2855 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
2856 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
2857 MRI, *TRI);
2858 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2859
2860 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
2861 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
2862 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
2863 InsertSize);
2864
2865 // The index can be in either bank if the source vector is a VGPR.
2866 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
2867 break;
2868 }
2869 case AMDGPU::G_UNMERGE_VALUES: {
2870 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
2871 AMDGPU::VGPRRegBankID;
2872
2873 // Op1 and Dst should use the same register bank.
2874 // FIXME: Shouldn't this be the default? Why do we need to handle this?
2875 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2876 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2877 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
2878 }
2879 break;
2880 }
2881 case AMDGPU::G_INTRINSIC: {
2882 switch (MI.getIntrinsicID()) {
2883 default:
2884 return getInvalidInstructionMapping();
2885 case Intrinsic::amdgcn_div_fmas:
2886 case Intrinsic::amdgcn_div_fixup:
2887 case Intrinsic::amdgcn_trig_preop:
2888 case Intrinsic::amdgcn_sin:
2889 case Intrinsic::amdgcn_cos:
2890 case Intrinsic::amdgcn_log_clamp:
2891 case Intrinsic::amdgcn_rcp:
2892 case Intrinsic::amdgcn_rcp_legacy:
2893 case Intrinsic::amdgcn_rsq:
2894 case Intrinsic::amdgcn_rsq_legacy:
2895 case Intrinsic::amdgcn_rsq_clamp:
2896 case Intrinsic::amdgcn_ldexp:
2897 case Intrinsic::amdgcn_frexp_mant:
2898 case Intrinsic::amdgcn_frexp_exp:
2899 case Intrinsic::amdgcn_fract:
2900 case Intrinsic::amdgcn_cvt_pkrtz:
2901 case Intrinsic::amdgcn_cvt_pknorm_i16:
2902 case Intrinsic::amdgcn_cvt_pknorm_u16:
2903 case Intrinsic::amdgcn_cvt_pk_i16:
2904 case Intrinsic::amdgcn_cvt_pk_u16:
2905 case Intrinsic::amdgcn_fmed3:
2906 case Intrinsic::amdgcn_cubeid:
2907 case Intrinsic::amdgcn_cubema:
2908 case Intrinsic::amdgcn_cubesc:
2909 case Intrinsic::amdgcn_cubetc:
2910 case Intrinsic::amdgcn_sffbh:
2911 case Intrinsic::amdgcn_fmad_ftz:
2912 case Intrinsic::amdgcn_mbcnt_lo:
2913 case Intrinsic::amdgcn_mbcnt_hi:
2914 case Intrinsic::amdgcn_ubfe:
2915 case Intrinsic::amdgcn_sbfe:
2916 case Intrinsic::amdgcn_mul_u24:
2917 case Intrinsic::amdgcn_mul_i24:
2918 case Intrinsic::amdgcn_lerp:
2919 case Intrinsic::amdgcn_sad_u8:
2920 case Intrinsic::amdgcn_msad_u8:
2921 case Intrinsic::amdgcn_sad_hi_u8:
2922 case Intrinsic::amdgcn_sad_u16:
2923 case Intrinsic::amdgcn_qsad_pk_u16_u8:
2924 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
2925 case Intrinsic::amdgcn_mqsad_u32_u8:
2926 case Intrinsic::amdgcn_cvt_pk_u8_f32:
2927 case Intrinsic::amdgcn_alignbit:
2928 case Intrinsic::amdgcn_alignbyte:
2929 case Intrinsic::amdgcn_fdot2:
2930 case Intrinsic::amdgcn_sdot2:
2931 case Intrinsic::amdgcn_udot2:
2932 case Intrinsic::amdgcn_sdot4:
2933 case Intrinsic::amdgcn_udot4:
2934 case Intrinsic::amdgcn_sdot8:
2935 case Intrinsic::amdgcn_udot8:
2936 case Intrinsic::amdgcn_wwm:
2937 case Intrinsic::amdgcn_wqm:
2938 return getDefaultMappingVOP(MI);
2939 case Intrinsic::amdgcn_ds_swizzle:
2940 case Intrinsic::amdgcn_ds_permute:
2941 case Intrinsic::amdgcn_ds_bpermute:
2942 case Intrinsic::amdgcn_update_dpp:
2943 return getDefaultMappingAllVGPR(MI);
2944 case Intrinsic::amdgcn_kernarg_segment_ptr:
2945 case Intrinsic::amdgcn_s_getpc:
2946 case Intrinsic::amdgcn_groupstaticsize: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register Offset = MI.getOperand(3).getReg(); // SGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
      OpdsMapping[4] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
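      // Result 0 is the scaled value (VGPR); result 1 is the 1-bit flag that
      // lands in VCC and is consumed by div_fmas.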
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(
        getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);

      break;
    }
    case Intrinsic::amdgcn_class: {
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
                                               Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
                                               Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
      unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
      break;
    }
    case Intrinsic::amdgcn_readlane: {
      // This must be an SGPR, but accept a VGPR.
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
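      // Fall through: the result and source operands use the same mapping as
      // readfirstlane below.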
      LLVM_FALLTHROUGH;
    }
    case Intrinsic::amdgcn_readfirstlane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These two operands must be SGPRs, but accept VGPRs; readfirstlane will
      // be inserted to legalize them.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
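      // if_break combines the 1-bit loop condition with the previous loop mask
      // to produce the new mask, which lives in SGPRs.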
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume:
    case Intrinsic::amdgcn_ds_fadd:
    case Intrinsic::amdgcn_ds_fmin:
    case Intrinsic::amdgcn_ds_fmax:
    case Intrinsic::amdgcn_atomic_inc:
    case Intrinsic::amdgcn_atomic_dec:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[0] = nullptr; // IntrinsicID
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      // FIXME: These are immediate values which can't be read from registers.
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_buffer_load: {
      Register RSrc = MI.getOperand(2).getReg();   // SGPR
      Register VIndex = MI.getOperand(3).getReg(); // VGPR
      Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm

      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
      unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
      unsigned Size4 = MRI.getType(Offset).getSizeInBits();

      unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
      unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
      OpdsMapping[1] = nullptr; // intrinsic id

      // Lie and claim everything is legal, even though some need to be
      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
      OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
      OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
      OpdsMapping[5] = nullptr;
      OpdsMapping[6] = nullptr;
      break;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf:
    case Intrinsic::amdgcn_init_exec: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
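      // Result 0 is the divergent condition (VCC); result 1 and operand 3 are
      // wave-sized exec masks, which are SGPR values.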
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage)
          return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
      }

      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

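    // Only a fully scalar select (SGPR sources with an SGPR condition) stays
    // on the SALU; anything with a divergent condition maps the values to
    // VGPRs.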
    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
    return getDefaultMappingAllVGPR(MI);
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
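    // A uniform condition can stay as a 1-bit SGPR value; anything else is
    // treated as a divergent condition in VCC.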
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

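  // Build the final instruction mapping from the per-operand value mappings
  // collected above.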
  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
