1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks:
/// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
/// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
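///
/// For illustration only (a rough sketch, not verbatim MIR), a divergent
/// compare feeding a select keeps its boolean in the VCC bank:
///
///   %c:vcc(s1) = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)
///   %r:vgpr(s32) = G_SELECT %c:vcc(s1), %x:vgpr(s32), %y:vgpr(s32)
///
/// while a uniform boolean is widened to 32 bits and lives in the SGPR bank:
///
///   %c:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32)
///   %r:sgpr(s32) = G_SELECT %c:sgpr(s32), %x:sgpr(s32), %y:sgpr(s32)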
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
/// operation. Forcing all inputs to VGPRs also simplifies the task of picking
/// the optimal operand combination in a post-isel optimization pass.
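///
/// For example (a rough sketch), a VALU add with one SGPR input is made
/// regbank-legal by copying that input to a VGPR rather than leaving a
/// mixed-bank operation:
///
///   %b.copy:vgpr(s32) = COPY %b:sgpr(s32)
///   %sum:vgpr(s32) = G_ADD %a:vgpr(s32), %b.copy:vgpr(s32)
///
/// A later fold can turn such a copy back into a direct SGPR use where the
/// constant bus limit allows it.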
68 ///
69 //===----------------------------------------------------------------------===//
70
71 #include "AMDGPURegisterBankInfo.h"
72
73 #include "AMDGPUGlobalISelUtils.h"
74 #include "AMDGPUInstrInfo.h"
75 #include "AMDGPUSubtarget.h"
76 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
84 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
85 #include "llvm/CodeGen/TargetRegisterInfo.h"
86 #include "llvm/CodeGen/TargetSubtargetInfo.h"
87 #include "llvm/IR/Constants.h"
88
89 #define GET_TARGET_REGBANK_IMPL
90 #include "AMDGPUGenRegisterBank.inc"
91
92 // This file will be TableGen'ed at some point.
93 #include "AMDGPUGenRegisterBankInfo.def"
94
95 using namespace llvm;
96 using namespace MIPatternMatch;
97
98 namespace {
99
100 // Observer to apply a register bank to new registers created by LegalizerHelper.
101 class ApplyRegBankMapping final : public GISelChangeObserver {
102 private:
103 const AMDGPURegisterBankInfo &RBI;
104 MachineRegisterInfo &MRI;
105 const RegisterBank *NewBank;
106 SmallVector<MachineInstr *, 4> NewInsts;
107
108 public:
ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
110 MachineRegisterInfo &MRI_, const RegisterBank *RB)
111 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
112
~ApplyRegBankMapping() {
114 for (MachineInstr *MI : NewInsts)
115 applyBank(*MI);
116 }
117
118 /// Set any registers that don't have a set register class or bank to SALU.
void applyBank(MachineInstr &MI) {
120 const unsigned Opc = MI.getOpcode();
121 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
122 Opc == AMDGPU::G_SEXT) {
123 // LegalizerHelper wants to use the basic legalization artifacts when
124 // widening etc. We don't handle selection with vcc in artifact sources,
// so we need to use a select instead to handle these properly.
126 Register DstReg = MI.getOperand(0).getReg();
127 Register SrcReg = MI.getOperand(1).getReg();
128 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
129 if (SrcBank == &AMDGPU::VCCRegBank) {
130 const LLT S32 = LLT::scalar(32);
131 assert(MRI.getType(SrcReg) == LLT::scalar(1));
132 assert(MRI.getType(DstReg) == S32);
133 assert(NewBank == &AMDGPU::VGPRRegBank);
134
135 // Replace the extension with a select, which really uses the boolean
136 // source.
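// For example, %dst:vgpr(s32) = G_SEXT %src:vcc(s1) becomes, roughly,
// %dst:vgpr(s32) = G_SELECT %src:vcc(s1), -1, 0.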
137 MachineIRBuilder B(MI);
138 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
139 auto False = B.buildConstant(S32, 0);
140 B.buildSelect(DstReg, SrcReg, True, False);
141 MRI.setRegBank(True.getReg(0), *NewBank);
142 MRI.setRegBank(False.getReg(0), *NewBank);
143 MI.eraseFromParent();
144 }
145
146 assert(!MRI.getRegClassOrRegBank(DstReg));
147 MRI.setRegBank(DstReg, *NewBank);
148 return;
149 }
150
151 #ifndef NDEBUG
152 if (Opc == AMDGPU::G_TRUNC) {
153 Register DstReg = MI.getOperand(0).getReg();
154 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
155 assert(DstBank != &AMDGPU::VCCRegBank);
156 }
157 #endif
158
159 for (MachineOperand &Op : MI.operands()) {
160 if (!Op.isReg())
161 continue;
162
163 // We may see physical registers if building a real MI
164 Register Reg = Op.getReg();
165 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
166 continue;
167
168 const RegisterBank *RB = NewBank;
169 if (MRI.getType(Reg) == LLT::scalar(1)) {
170 assert(NewBank == &AMDGPU::VGPRRegBank &&
171 "s1 operands should only be used for vector bools");
172 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
173 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
174 "not expecting legalization artifacts here");
175 RB = &AMDGPU::VCCRegBank;
176 }
177
178 MRI.setRegBank(Reg, *RB);
179 }
180 }
181
void erasingInstr(MachineInstr &MI) override {}
183
void createdInstr(MachineInstr &MI) override {
185 // At this point, the instruction was just inserted and has no operands.
186 NewInsts.push_back(&MI);
187 }
188
void changingInstr(MachineInstr &MI) override {}
void changedInstr(MachineInstr &MI) override {}
191 };
192
}

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
195 : AMDGPUGenRegisterBankInfo(),
196 Subtarget(ST),
197 TRI(Subtarget.getRegisterInfo()),
198 TII(Subtarget.getInstrInfo()) {
199
200 // HACK: Until this is fully tablegen'd.
201 static llvm::once_flag InitializeRegisterBankFlag;
202
203 static auto InitializeRegisterBankOnce = [this]() {
204 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207 (void)this;
208 };
209
210 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211 }
212
static bool isVectorRegisterBank(const RegisterBank &Bank) {
214 unsigned BankID = Bank.getID();
215 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216 }
217
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
219 const RegisterBank &Src,
220 unsigned Size) const {
221 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
222 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
223 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
224 return std::numeric_limits<unsigned>::max();
225 }
226
227 // Bool values are tricky, because the meaning is based on context. The SCC
228 // and VCC banks are for the natural scalar and vector conditions produced by
229 // a compare.
230 //
231 // Legalization doesn't know about the necessary context, so an s1 use may
232 // have been a truncate from an arbitrary value, in which case a copy (lowered
233 // as a compare with 0) needs to be inserted.
234 if (Size == 1 &&
235 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
236 (isVectorRegisterBank(Src) ||
237 Src.getID() == AMDGPU::SGPRRegBankID ||
238 Src.getID() == AMDGPU::VCCRegBankID))
239 return std::numeric_limits<unsigned>::max();
240
241 // There is no direct copy between AGPRs.
242 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
243 Src.getID() == AMDGPU::AGPRRegBankID)
244 return 4;
245
246 return RegisterBankInfo::copyCost(Dst, Src, Size);
247 }
248
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
250 const ValueMapping &ValMapping,
251 const RegisterBank *CurBank) const {
252 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
253 // VGPR.
254 // FIXME: Is there a better way to do this?
255 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
256 return 10; // This is expensive.
257
258 assert(ValMapping.NumBreakDowns == 2 &&
259 ValMapping.BreakDown[0].Length == 32 &&
260 ValMapping.BreakDown[0].StartIdx == 0 &&
261 ValMapping.BreakDown[1].Length == 32 &&
262 ValMapping.BreakDown[1].StartIdx == 32 &&
263 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
264
265 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
266 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
267 // want.
268
269 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
270 // alignment restrictions, but this probably isn't important.
271 return 1;
272 }
273
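// Map a register class back to a register bank. SReg_1, and SGPR classes used
// with an s1 type, map to VCC; other SGPR classes map to SGPR; AGPR classes
// map to AGPR; everything else maps to VGPR.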
274 const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
276 LLT Ty) const {
277 if (&RC == &AMDGPU::SReg_1RegClass)
278 return AMDGPU::VCCRegBank;
279
280 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
281 // VCC-like use.
282 if (TRI->isSGPRClass(&RC)) {
283 // FIXME: This probably came from a copy from a physical register, which
// should be inferable from the copied-to type. We don't have many boolean
285 // physical register constraints so just assume a normal SGPR for now.
286 if (!Ty.isValid())
287 return AMDGPU::SGPRRegBank;
288
289 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
290 }
291
292 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
293 }
294
295 template <unsigned NumOps>
296 RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
298 const MachineInstr &MI, const MachineRegisterInfo &MRI,
299 const std::array<unsigned, NumOps> RegSrcOpIdx,
300 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
301
302 InstructionMappings AltMappings;
303
304 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
305
306 unsigned Sizes[NumOps];
307 for (unsigned I = 0; I < NumOps; ++I) {
308 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
309 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
310 }
311
312 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
313 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
314 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
315 }
316
317 // getInstrMapping's default mapping uses ID 1, so start at 2.
318 unsigned MappingID = 2;
319 for (const auto &Entry : Table) {
320 for (unsigned I = 0; I < NumOps; ++I) {
321 int OpIdx = RegSrcOpIdx[I];
322 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
323 }
324
325 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
326 getOperandsMapping(Operands),
327 Operands.size()));
328 }
329
330 return AltMappings;
331 }
332
333 RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
335 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
336 switch (MI.getIntrinsicID()) {
337 case Intrinsic::amdgcn_readlane: {
338 static const OpRegBankEntry<3> Table[2] = {
339 // Perfectly legal.
340 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
341
342 // Need a readfirstlane for the index.
343 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
344 };
345
346 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
347 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
348 }
349 case Intrinsic::amdgcn_writelane: {
350 static const OpRegBankEntry<4> Table[4] = {
351 // Perfectly legal.
352 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
353
354 // Need readfirstlane of first op
355 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
356
357 // Need readfirstlane of second op
358 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
359
360 // Need readfirstlane of both ops
361 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
362 };
363
// dst, value to write, lane select, original value
365 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
366 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
367 }
368 default:
369 return RegisterBankInfo::getInstrAlternativeMappings(MI);
370 }
371 }
372
373 RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
375 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
376
377 switch (MI.getIntrinsicID()) {
378 case Intrinsic::amdgcn_s_buffer_load: {
379 static const OpRegBankEntry<2> Table[4] = {
380 // Perfectly legal.
381 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
382
383 // Only need 1 register in loop
384 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
385
386 // Have to waterfall the resource.
387 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
388
389 // Have to waterfall the resource, and the offset.
390 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
391 };
392
393 // rsrc, offset
394 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
395 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
396 }
397 case Intrinsic::amdgcn_ds_ordered_add:
398 case Intrinsic::amdgcn_ds_ordered_swap: {
399 // VGPR = M0, VGPR
400 static const OpRegBankEntry<3> Table[2] = {
401 // Perfectly legal.
402 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
403
404 // Need a readfirstlane for m0
405 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
406 };
407
408 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
409 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
410 }
411 case Intrinsic::amdgcn_s_sendmsg:
412 case Intrinsic::amdgcn_s_sendmsghalt: {
413 // FIXME: Should have no register for immediate
414 static const OpRegBankEntry<1> Table[2] = {
415 // Perfectly legal.
416 { { AMDGPU::SGPRRegBankID }, 1 },
417
418 // Need readlane
419 { { AMDGPU::VGPRRegBankID }, 3 }
420 };
421
422 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
423 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
424 }
425 default:
426 return RegisterBankInfo::getInstrAlternativeMappings(MI);
427 }
428 }
429
static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
431 const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
432 return I && I->getMetadata("amdgpu.noclobber");
433 }
434
435 // FIXME: Returns uniform if there's no source value information. This is
436 // probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
438 if (!MI.hasOneMemOperand())
439 return false;
440
441 const MachineMemOperand *MMO = *MI.memoperands_begin();
442 const unsigned AS = MMO->getAddrSpace();
443 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
444 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
445
446 // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
447 return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
448 // Can't do a scalar atomic load.
449 !MMO->isAtomic() &&
450 // Don't use scalar loads for volatile accesses to non-constant address
451 // spaces.
452 (IsConst || !MMO->isVolatile()) &&
453 // Memory must be known constant, or not written before this load.
454 (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
455 AMDGPUInstrInfo::isUniformMMO(MMO);
456 }
457
458 RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460 const MachineInstr &MI) const {
461
462 const MachineFunction &MF = *MI.getParent()->getParent();
463 const MachineRegisterInfo &MRI = MF.getRegInfo();
464
465
466 InstructionMappings AltMappings;
467 switch (MI.getOpcode()) {
468 case TargetOpcode::G_CONSTANT: {
469 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
470 if (Size == 1) {
471 static const OpRegBankEntry<1> Table[3] = {
472 { { AMDGPU::VGPRRegBankID }, 1 },
473 { { AMDGPU::SGPRRegBankID }, 1 },
474 { { AMDGPU::VCCRegBankID }, 1 }
475 };
476
477 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
478 }
479
480 LLVM_FALLTHROUGH;
481 }
482 case TargetOpcode::G_FCONSTANT:
483 case TargetOpcode::G_FRAME_INDEX:
484 case TargetOpcode::G_GLOBAL_VALUE: {
485 static const OpRegBankEntry<1> Table[2] = {
486 { { AMDGPU::VGPRRegBankID }, 1 },
487 { { AMDGPU::SGPRRegBankID }, 1 }
488 };
489
490 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
491 }
492 case TargetOpcode::G_AND:
493 case TargetOpcode::G_OR:
494 case TargetOpcode::G_XOR: {
495 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
496
497 if (Size == 1) {
498 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499 const InstructionMapping &SCCMapping = getInstructionMapping(
500 1, 1, getOperandsMapping(
501 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
504 3); // Num Operands
505 AltMappings.push_back(&SCCMapping);
506
507 const InstructionMapping &VCCMapping0 = getInstructionMapping(
508 2, 1, getOperandsMapping(
509 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
512 3); // Num Operands
513 AltMappings.push_back(&VCCMapping0);
514 return AltMappings;
515 }
516
517 if (Size != 64)
518 break;
519
520 const InstructionMapping &SSMapping = getInstructionMapping(
521 1, 1, getOperandsMapping(
522 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
525 3); // Num Operands
526 AltMappings.push_back(&SSMapping);
527
528 const InstructionMapping &VVMapping = getInstructionMapping(
529 2, 2, getOperandsMapping(
530 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
533 3); // Num Operands
534 AltMappings.push_back(&VVMapping);
535 break;
536 }
537 case TargetOpcode::G_LOAD:
538 case TargetOpcode::G_ZEXTLOAD:
539 case TargetOpcode::G_SEXTLOAD: {
540 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
541 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
542 unsigned PtrSize = PtrTy.getSizeInBits();
543 unsigned AS = PtrTy.getAddressSpace();
544
545 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
546 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
547 isScalarLoadLegal(MI)) {
548 const InstructionMapping &SSMapping = getInstructionMapping(
549 1, 1, getOperandsMapping(
550 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
551 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
552 2); // Num Operands
553 AltMappings.push_back(&SSMapping);
554 }
555
556 const InstructionMapping &VVMapping = getInstructionMapping(
557 2, 1,
558 getOperandsMapping(
559 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
560 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
561 2); // Num Operands
562 AltMappings.push_back(&VVMapping);
563
564 // It may be possible to have a vgpr = load sgpr mapping here, because
565 // the mubuf instructions support this kind of load, but probably for only
566 // gfx7 and older. However, the addressing mode matching in the instruction
567 // selector should be able to do a better job of detecting and selecting
568 // these kinds of loads from the vgpr = load vgpr mapping.
569
570 return AltMappings;
571
572 }
573 case TargetOpcode::G_SELECT: {
574 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
575 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
576 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
577 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
578 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
579 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
580 4); // Num Operands
581 AltMappings.push_back(&SSMapping);
582
583 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
584 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
585 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
586 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
587 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
588 4); // Num Operands
589 AltMappings.push_back(&VVMapping);
590
591 return AltMappings;
592 }
593 case TargetOpcode::G_SMIN:
594 case TargetOpcode::G_SMAX:
595 case TargetOpcode::G_UMIN:
596 case TargetOpcode::G_UMAX: {
597 static const OpRegBankEntry<3> Table[2] = {
598 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
599
600 // Scalar requires cmp+select, and extends if 16-bit.
601 // FIXME: Should there be separate costs for 32 and 16-bit
602 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
603 };
604
605 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
606 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
607 }
608 case TargetOpcode::G_UADDE:
609 case TargetOpcode::G_USUBE:
610 case TargetOpcode::G_SADDE:
611 case TargetOpcode::G_SSUBE: {
612 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
613 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
614 getOperandsMapping(
615 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
616 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
617 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
618 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
619 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
620 5); // Num Operands
621 AltMappings.push_back(&SSMapping);
622
623 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
624 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
625 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
626 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
627 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
628 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
629 5); // Num Operands
630 AltMappings.push_back(&VVMapping);
631 return AltMappings;
632 }
633 case AMDGPU::G_BRCOND: {
634 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
635
636 // TODO: Change type to 32 for scalar
637 const InstructionMapping &SMapping = getInstructionMapping(
638 1, 1, getOperandsMapping(
639 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
640 2); // Num Operands
641 AltMappings.push_back(&SMapping);
642
643 const InstructionMapping &VMapping = getInstructionMapping(
644 1, 1, getOperandsMapping(
645 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
646 2); // Num Operands
647 AltMappings.push_back(&VMapping);
648 return AltMappings;
649 }
650 case AMDGPU::G_INTRINSIC:
651 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
652 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
653 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
654 default:
655 break;
656 }
657 return RegisterBankInfo::getInstrAlternativeMappings(MI);
658 }
659
void AMDGPURegisterBankInfo::split64BitValueForMapping(
661 MachineIRBuilder &B,
662 SmallVector<Register, 2> &Regs,
663 LLT HalfTy,
664 Register Reg) const {
665 assert(HalfTy.getSizeInBits() == 32);
666 MachineRegisterInfo *MRI = B.getMRI();
667 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
668 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
669 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
670 MRI->setRegBank(LoLHS, *Bank);
671 MRI->setRegBank(HiLHS, *Bank);
672
673 Regs.push_back(LoLHS);
674 Regs.push_back(HiLHS);
675
676 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
677 .addDef(LoLHS)
678 .addDef(HiLHS)
679 .addUse(Reg);
680 }
681
682 /// Replace the current type each register in \p Regs has with \p NewTy
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
684 LLT NewTy) {
685 for (Register Reg : Regs) {
686 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
687 MRI.setType(Reg, NewTy);
688 }
689 }
690
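// Return a type half the size of Ty, e.g. s64 -> s32 and <4 x s16> -> <2 x s16>.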
static LLT getHalfSizedType(LLT Ty) {
692 if (Ty.isVector()) {
693 assert(Ty.getNumElements() % 2 == 0);
694 return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
695 }
696
697 assert(Ty.getSizeInBits() % 2 == 0);
698 return LLT::scalar(Ty.getSizeInBits() / 2);
699 }
700
701 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
702 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
703 /// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that the rest of the instructions
/// are moved to a new block.
705 /// moved to a new block.
706 ///
707 /// Essentially performs this loop:
///
709 /// Save Execution Mask
710 /// For (Lane : Wavefront) {
711 /// Enable Lane, Disable all other lanes
712 /// SGPR = read SGPR value for current lane from VGPR
713 /// VGPRResult[Lane] = use_op SGPR
714 /// }
715 /// Restore Execution Mask
716 ///
/// There is additional complexity from comparing the values in order to
/// identify the unique values used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
720 MachineIRBuilder &B,
721 iterator_range<MachineBasicBlock::iterator> Range,
722 SmallSet<Register, 4> &SGPROperandRegs,
723 MachineRegisterInfo &MRI) const {
724 SmallVector<Register, 4> ResultRegs;
725 SmallVector<Register, 4> InitResultRegs;
726 SmallVector<Register, 4> PhiRegs;
727
728 // Track use registers which have already been expanded with a readfirstlane
729 // sequence. This may have multiple uses if moving a sequence.
730 DenseMap<Register, Register> WaterfalledRegMap;
731
732 MachineBasicBlock &MBB = B.getMBB();
733 MachineFunction *MF = &B.getMF();
734
735 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
736 const unsigned WaveAndOpc = Subtarget.isWave32() ?
737 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
738 const unsigned MovTermOpc = Subtarget.isWave32() ?
739 AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
740 const unsigned XorTermOpc = Subtarget.isWave32() ?
741 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
742 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
743 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
744 const unsigned ExecReg = Subtarget.isWave32() ?
745 AMDGPU::EXEC_LO : AMDGPU::EXEC;
746
747 #ifndef NDEBUG
748 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
749 #endif
750
751 for (MachineInstr &MI : Range) {
752 for (MachineOperand &Def : MI.defs()) {
753 if (MRI.use_nodbg_empty(Def.getReg()))
754 continue;
755
756 LLT ResTy = MRI.getType(Def.getReg());
757 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
758 ResultRegs.push_back(Def.getReg());
759 Register InitReg = B.buildUndef(ResTy).getReg(0);
760 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
761 InitResultRegs.push_back(InitReg);
762 PhiRegs.push_back(PhiReg);
763 MRI.setRegBank(PhiReg, *DefBank);
764 MRI.setRegBank(InitReg, *DefBank);
765 }
766 }
767
768 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
769 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
770
771 // Don't bother using generic instructions/registers for the exec mask.
772 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
773 .addDef(InitSaveExecReg);
774
775 Register PhiExec = MRI.createVirtualRegister(WaveRC);
776 Register NewExec = MRI.createVirtualRegister(WaveRC);
777
778 // To insert the loop we need to split the block. Move everything before this
779 // point to a new block, and insert a new empty block before this instruction.
780 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
781 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
782 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
783 MachineFunction::iterator MBBI(MBB);
784 ++MBBI;
785 MF->insert(MBBI, LoopBB);
786 MF->insert(MBBI, RestoreExecBB);
787 MF->insert(MBBI, RemainderBB);
788
789 LoopBB->addSuccessor(RestoreExecBB);
790 LoopBB->addSuccessor(LoopBB);
791
792 // Move the rest of the block into a new block.
793 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
794 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
795
796 MBB.addSuccessor(LoopBB);
797 RestoreExecBB->addSuccessor(RemainderBB);
798
799 B.setInsertPt(*LoopBB, LoopBB->end());
800
801 B.buildInstr(TargetOpcode::PHI)
802 .addDef(PhiExec)
803 .addReg(InitSaveExecReg)
804 .addMBB(&MBB)
805 .addReg(NewExec)
806 .addMBB(LoopBB);
807
808 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
809 B.buildInstr(TargetOpcode::G_PHI)
810 .addDef(std::get<2>(Result))
811 .addReg(std::get<0>(Result)) // Initial value / implicit_def
812 .addMBB(&MBB)
813 .addReg(std::get<1>(Result)) // Mid-loop value.
814 .addMBB(LoopBB);
815 }
816
817 const DebugLoc &DL = B.getDL();
818
819 MachineInstr &FirstInst = *Range.begin();
820
821 // Move the instruction into the loop. Note we moved everything after
822 // Range.end() already into a new block, so Range.end() is no longer valid.
823 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
824
825 // Figure out the iterator range after splicing the instructions.
826 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
827 auto NewEnd = LoopBB->end();
828
829 MachineBasicBlock::iterator I = Range.begin();
830 B.setInsertPt(*LoopBB, I);
831
832 Register CondReg;
833
834 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
835
836 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
837 for (MachineOperand &Op : MI.uses()) {
838 if (!Op.isReg() || Op.isDef())
839 continue;
840
841 Register OldReg = Op.getReg();
842 if (!SGPROperandRegs.count(OldReg))
843 continue;
844
845 // See if we already processed this register in another instruction in the
846 // sequence.
847 auto OldVal = WaterfalledRegMap.find(OldReg);
848 if (OldVal != WaterfalledRegMap.end()) {
849 Op.setReg(OldVal->second);
850 continue;
851 }
852
853 Register OpReg = Op.getReg();
854 LLT OpTy = MRI.getType(OpReg);
855
856 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
857 if (OpBank != &AMDGPU::VGPRRegBank) {
858 // Insert copy from AGPR to VGPR before the loop.
859 B.setMBB(MBB);
860 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
861 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
862 B.setInstr(*I);
863 }
864
865 unsigned OpSize = OpTy.getSizeInBits();
866
867 // Can only do a readlane of 32-bit pieces.
868 if (OpSize == 32) {
869 // Avoid extra copies in the simple case of one 32-bit register.
870 Register CurrentLaneOpReg
871 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
872 MRI.setType(CurrentLaneOpReg, OpTy);
873
874 constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
875 // Read the next variant <- also loop target.
876 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
877 CurrentLaneOpReg)
878 .addReg(OpReg);
879
880 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
881 bool First = CondReg == AMDGPU::NoRegister;
882 if (First)
883 CondReg = NewCondReg;
884
885 // Compare the just read M0 value to all possible Idx values.
886 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
887 .addDef(NewCondReg)
888 .addReg(CurrentLaneOpReg)
889 .addReg(OpReg);
890 Op.setReg(CurrentLaneOpReg);
891
892 if (!First) {
893 Register AndReg = MRI.createVirtualRegister(WaveRC);
894
// If there are multiple operands to consider, AND the conditions together.
896 B.buildInstr(WaveAndOpc)
897 .addDef(AndReg)
898 .addReg(NewCondReg)
899 .addReg(CondReg);
900 CondReg = AndReg;
901 }
902 } else {
903 LLT S32 = LLT::scalar(32);
904 SmallVector<Register, 8> ReadlanePieces;
905
906 // The compares can be done as 64-bit, but the extract needs to be done
907 // in 32-bit pieces.
908
909 bool Is64 = OpSize % 64 == 0;
910
911 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
912 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
913 : AMDGPU::V_CMP_EQ_U32_e64;
914
918 // Insert the unmerge before the loop.
919
920 B.setMBB(MBB);
921 auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
922 B.setInstr(*I);
923
924 unsigned NumPieces = Unmerge->getNumOperands() - 1;
925 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
926 Register UnmergePiece = Unmerge.getReg(PieceIdx);
927
928 Register CurrentLaneOpReg;
929 if (Is64) {
930 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
931 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
932
933 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
934 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
935 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
936
937 // Read the next variant <- also loop target.
938 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
939 CurrentLaneOpRegLo)
940 .addReg(UnmergePiece, 0, AMDGPU::sub0);
941
942 // Read the next variant <- also loop target.
943 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
944 CurrentLaneOpRegHi)
945 .addReg(UnmergePiece, 0, AMDGPU::sub1);
946
947 CurrentLaneOpReg =
948 B.buildMerge(LLT::scalar(64),
949 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
950 .getReg(0);
951
952 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
953
954 if (OpTy.getScalarSizeInBits() == 64) {
// If we need to produce a 64-bit element vector, use the merged
// pieces.
957 ReadlanePieces.push_back(CurrentLaneOpReg);
958 } else {
959 // 32-bit element type.
960 ReadlanePieces.push_back(CurrentLaneOpRegLo);
961 ReadlanePieces.push_back(CurrentLaneOpRegHi);
962 }
963 } else {
964 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
965 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
966 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
967
968 // Read the next variant <- also loop target.
969 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
970 CurrentLaneOpReg)
971 .addReg(UnmergePiece);
972 ReadlanePieces.push_back(CurrentLaneOpReg);
973 }
974
975 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
976 bool First = CondReg == AMDGPU::NoRegister;
977 if (First)
978 CondReg = NewCondReg;
979
980 B.buildInstr(CmpOp)
981 .addDef(NewCondReg)
982 .addReg(CurrentLaneOpReg)
983 .addReg(UnmergePiece);
984
985 if (!First) {
986 Register AndReg = MRI.createVirtualRegister(WaveRC);
987
// If there are multiple operands to consider, AND the conditions together.
989 B.buildInstr(WaveAndOpc)
990 .addDef(AndReg)
991 .addReg(NewCondReg)
992 .addReg(CondReg);
993 CondReg = AndReg;
994 }
995 }
996
997 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
998 // BUILD_VECTOR
999 if (OpTy.isVector()) {
1000 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
1001 Op.setReg(Merge.getReg(0));
1002 } else {
1003 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
1004 Op.setReg(Merge.getReg(0));
1005 }
1006
1007 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
1008 }
1009
1010 // Make sure we don't re-process this register again.
1011 WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
1012 }
1013 }
1014
1015 B.setInsertPt(*LoopBB, LoopBB->end());
1016
1017 // Update EXEC, save the original EXEC value to VCC.
1018 B.buildInstr(AndSaveExecOpc)
1019 .addDef(NewExec)
1020 .addReg(CondReg, RegState::Kill);
1021
1022 MRI.setSimpleHint(NewExec, CondReg);
1023
1024 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1025 B.buildInstr(XorTermOpc)
1026 .addDef(ExecReg)
1027 .addReg(ExecReg)
1028 .addReg(NewExec);
1029
1030 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1031 // s_cbranch_scc0?
1032
1033 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1034 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1035 .addMBB(LoopBB);
1036
1037 // Save the EXEC mask before the loop.
1038 BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1039 .addReg(ExecReg);
1040
1041 // Restore the EXEC mask after the loop.
1042 B.setMBB(*RestoreExecBB);
1043 B.buildInstr(MovTermOpc)
1044 .addDef(ExecReg)
1045 .addReg(SaveExecReg);
1046
1047 // Set the insert point after the original instruction, so any new
1048 // instructions will be in the remainder.
1049 B.setInsertPt(*RemainderBB, RemainderBB->begin());
1050
1051 return true;
1052 }
1053
1054 // Return any unique registers used by \p MI at \p OpIndices that need to be
1055 // handled in a waterfall loop. Returns these registers in \p
1056 // SGPROperandRegs. Returns true if there are any operands to handle and a
1057 // waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1059 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1060 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1061 for (unsigned Op : OpIndices) {
1062 assert(MI.getOperand(Op).isUse());
1063 Register Reg = MI.getOperand(Op).getReg();
1064 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1065 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1066 SGPROperandRegs.insert(Reg);
1067 }
1068
1069 // No operands need to be replaced, so no need to loop.
1070 return !SGPROperandRegs.empty();
1071 }
1072
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1074 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1075 ArrayRef<unsigned> OpIndices) const {
1076 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1077 // are the same register.
1078 SmallSet<Register, 4> SGPROperandRegs;
1079
1080 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1081 return false;
1082
1083 MachineBasicBlock::iterator I = MI.getIterator();
1084 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1085 SGPROperandRegs, MRI);
1086 }
1087
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1089 MachineInstr &MI, MachineRegisterInfo &MRI,
1090 ArrayRef<unsigned> OpIndices) const {
1091 MachineIRBuilder B(MI);
1092 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1093 }
1094
1095 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
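//
// A rough before/after sketch, assuming the operand at OpIdx was in a VGPR
// (SOME_OP and the register names are illustrative only):
//   ... = SOME_OP ..., %idx:vgpr(s32)
// becomes
//   %idx.sgpr:sreg_32(s32) = V_READFIRSTLANE_B32 %idx:vgpr_32(s32)
//   ... = SOME_OP ..., %idx.sgpr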
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1097 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1098 Register Reg = MI.getOperand(OpIdx).getReg();
1099 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1100 if (Bank == &AMDGPU::SGPRRegBank)
1101 return;
1102
1103 LLT Ty = MRI.getType(Reg);
1104 MachineIRBuilder B(MI);
1105
1106 if (Bank != &AMDGPU::VGPRRegBank) {
1107 // We need to copy from AGPR to VGPR
1108 Reg = B.buildCopy(Ty, Reg).getReg(0);
1109 MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1110 }
1111
1112 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1113 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1114 .addDef(SGPR)
1115 .addReg(Reg);
1116
1117 MRI.setType(SGPR, Ty);
1118
1119 const TargetRegisterClass *Constrained =
1120 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1121 (void)Constrained;
1122 assert(Constrained && "Failed to constrain readfirstlane src reg");
1123
1124 MI.getOperand(OpIdx).setReg(SGPR);
1125 }
1126
1127 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1128 /// rest will be in the remainder.
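/// For example, splitUnequalType(<3 x s32>, 64) gives {<2 x s32>, s32}, and
/// splitUnequalType(s96, 64) gives {s64, s32}.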
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1130 unsigned TotalSize = Ty.getSizeInBits();
1131 if (!Ty.isVector())
1132 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1133
1134 LLT EltTy = Ty.getElementType();
1135 unsigned EltSize = EltTy.getSizeInBits();
1136 assert(FirstSize % EltSize == 0);
1137
1138 unsigned FirstPartNumElts = FirstSize / EltSize;
1139 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1140
1141 return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
1142 LLT::scalarOrVector(RemainderElts, EltTy)};
1143 }
1144
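// Widen a 96-bit value type to 128 bits, e.g. s96 -> s128 and
// <3 x s32> -> <4 x s32>.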
static LLT widen96To128(LLT Ty) {
1146 if (!Ty.isVector())
1147 return LLT::scalar(128);
1148
1149 LLT EltTy = Ty.getElementType();
1150 assert(128 % EltTy.getSizeInBits() == 0);
1151 return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
1152 }
1153
bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1155 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1156 MachineRegisterInfo &MRI) const {
1157 Register DstReg = MI.getOperand(0).getReg();
1158 const LLT LoadTy = MRI.getType(DstReg);
1159 unsigned LoadSize = LoadTy.getSizeInBits();
1160 const unsigned MaxNonSmrdLoadSize = 128;
1161
1162 const RegisterBank *PtrBank =
1163 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1164 if (PtrBank == &AMDGPU::SGPRRegBank) {
1165 // If the pointer is an SGPR, we ordinarily have nothing to do.
1166 if (LoadSize != 96)
1167 return false;
1168
1169 MachineMemOperand *MMO = *MI.memoperands_begin();
1170 Register PtrReg = MI.getOperand(1).getReg();
// 96-bit loads are only available for vector loads. We need to split this
// into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit
// load).
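//
// A rough sketch of the under-aligned s96 path below:
//   %lo:sgpr(s64) = G_LOAD %ptr            ; bytes 0..7
//   %hi:sgpr(s32) = G_LOAD %ptr + 8 bytes  ; bytes 8..11
//   %dst:sgpr(s96) = insert %lo at bit 0, then %hi at bit 64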
1173
1174 MachineIRBuilder B(MI);
1175 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1176 GISelObserverWrapper Observer(&O);
1177 B.setChangeObserver(Observer);
1178
1179 if (MMO->getAlign() < Align(16)) {
1180 LLT Part64, Part32;
1181 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1182 auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
1183 auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
1184
1185 auto Undef = B.buildUndef(LoadTy);
1186 auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
1187 B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
1188 } else {
1189 LLT WiderTy = widen96To128(LoadTy);
1190 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1191 B.buildExtract(MI.getOperand(0), WideLoad, 0);
1192 }
1193
1194 MI.eraseFromParent();
1195 return true;
1196 }
1197
1198 // 128-bit loads are supported for all instruction types.
1199 if (LoadSize <= MaxNonSmrdLoadSize)
1200 return false;
1201
1202 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1203 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1204
1205 if (SrcRegs.empty())
1206 SrcRegs.push_back(MI.getOperand(1).getReg());
1207
1208 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1209
1210 // RegBankSelect only emits scalar types, so we need to reset the pointer
1211 // operand to a pointer type.
1212 Register BasePtrReg = SrcRegs[0];
1213 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1214 MRI.setType(BasePtrReg, PtrTy);
1215
1216 MachineIRBuilder B(MI);
1217
1218 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1219 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1220 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
1221 GISelObserverWrapper Observer(&O);
1222 B.setChangeObserver(Observer);
1223 LegalizerHelper Helper(B.getMF(), Observer, B);
1224
1225 if (LoadTy.isVector()) {
1226 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1227 return false;
1228 } else {
1229 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1230 return false;
1231 }
1232
1233 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1234 return true;
1235 }
1236
bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1238 MachineInstr &MI,
1239 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1240 MachineRegisterInfo &MRI) const {
1241 const MachineFunction &MF = *MI.getMF();
1242 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1243 const auto &TFI = *ST.getFrameLowering();
1244
1245 // Guard in case the stack growth direction ever changes with scratch
1246 // instructions.
1247 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1248 return false;
1249
1250 Register Dst = MI.getOperand(0).getReg();
1251 Register AllocSize = MI.getOperand(1).getReg();
1252 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1253
1254 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1255
1256 // TODO: Need to emit a wave reduction to get the maximum size.
1257 if (SizeBank != &AMDGPU::SGPRRegBank)
1258 return false;
1259
1260 LLT PtrTy = MRI.getType(Dst);
1261 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1262
1263 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1264 Register SPReg = Info->getStackPtrOffsetReg();
1265 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1266 GISelObserverWrapper Observer(&ApplyBank);
1267
1268 MachineIRBuilder B(MI);
1269 B.setChangeObserver(Observer);
1270
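// The stack pointer tracks scratch for the whole wave, so scale the per-lane
// allocation size by the wave size; e.g. a 16-byte per-lane allocation becomes
// 16 << 6 = 1024 bytes on wave64.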
1271 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1272 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1273
1274 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1275 if (Alignment > TFI.getStackAlign()) {
1276 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1277 B.buildMaskLowPtrBits(Dst, PtrAdd,
1278 Log2(Alignment) + ST.getWavefrontSizeLog2());
1279 } else {
1280 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1281 }
1282
1283 MI.eraseFromParent();
1284 return true;
1285 }
1286
bool AMDGPURegisterBankInfo::applyMappingImage(
1288 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1289 MachineRegisterInfo &MRI, int RsrcIdx) const {
1290 const int NumDefs = MI.getNumExplicitDefs();
1291
1292 // The reported argument index is relative to the IR intrinsic call arguments,
1293 // so we need to shift by the number of defs and the intrinsic ID.
1294 RsrcIdx += NumDefs + 1;
1295
1296 // Insert copies to VGPR arguments.
1297 applyDefaultMapping(OpdMapper);
1298
1299 // Fixup any SGPR arguments.
1300 SmallVector<unsigned, 4> SGPRIndexes;
1301 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1302 if (!MI.getOperand(I).isReg())
1303 continue;
1304
1305 // If this intrinsic has a sampler, it immediately follows rsrc.
1306 if (I == RsrcIdx || I == RsrcIdx + 1)
1307 SGPRIndexes.push_back(I);
1308 }
1309
1310 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1311 return true;
1312 }
1313
static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1315 Register Reg) {
1316 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1317 if (!Def)
1318 return Reg;
1319
1320 // TODO: Guard against this being an implicit def
1321 return Def->getOperand(0).getReg();
1322 }
1323
1324 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1325 // the three offsets (voffset, soffset and instoffset)
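//
// For example (a sketch; the exact split depends on the subtarget's immediate
// offset range), a small constant combined offset can live entirely in
// instoffset with voffset = soffset = 0, while a uniform non-constant base is
// placed in soffset with any remaining constant part in instoffset.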
static unsigned setBufferOffsets(MachineIRBuilder &B,
1327 const AMDGPURegisterBankInfo &RBI,
1328 Register CombinedOffset, Register &VOffsetReg,
1329 Register &SOffsetReg, int64_t &InstOffsetVal,
1330 Align Alignment) {
1331 const LLT S32 = LLT::scalar(32);
1332 MachineRegisterInfo *MRI = B.getMRI();
1333
1334 if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
1335 uint32_t SOffset, ImmOffset;
1336 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1337 Alignment)) {
1338 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1339 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1340 InstOffsetVal = ImmOffset;
1341
1342 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1343 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1344 return SOffset + ImmOffset;
1345 }
1346 }
1347
1348 Register Base;
1349 unsigned Offset;
1350
1351 std::tie(Base, Offset) =
1352 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1353
1354 uint32_t SOffset, ImmOffset;
1355 if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1356 &RBI.Subtarget, Alignment)) {
1357 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1358 VOffsetReg = Base;
1359 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1360 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1361 InstOffsetVal = ImmOffset;
1362 return 0; // XXX - Why is this 0?
1363 }
1364
1365 // If we have SGPR base, we can use it for soffset.
1366 if (SOffset == 0) {
1367 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1368 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1369 SOffsetReg = Base;
1370 InstOffsetVal = ImmOffset;
1371 return 0; // XXX - Why is this 0?
1372 }
1373 }
1374
1375 // Handle the variable sgpr + vgpr case.
1376 if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
1377 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1378 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1379
1380 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1381 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1382
1383 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1384 VOffsetReg = Src0;
1385 SOffsetReg = Src1;
1386 return 0;
1387 }
1388
1389 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1390 VOffsetReg = Src1;
1391 SOffsetReg = Src0;
1392 return 0;
1393 }
1394 }
1395
1396 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1397 // have an SGPR offset and a VGPR resource.
1398 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1399 VOffsetReg = CombinedOffset;
1400 } else {
1401 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1402 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1403 }
1404
1405 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1406 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1407 return 0;
1408 }
1409
bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1411 const OperandsMapper &OpdMapper) const {
1412 MachineInstr &MI = OpdMapper.getMI();
1413 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1414
1415 const LLT S32 = LLT::scalar(32);
1416 Register Dst = MI.getOperand(0).getReg();
1417 LLT Ty = MRI.getType(Dst);
1418
1419 const RegisterBank *RSrcBank =
1420 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1421 const RegisterBank *OffsetBank =
1422 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1423 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1424 OffsetBank == &AMDGPU::SGPRRegBank)
1425 return true; // Legal mapping
1426
// FIXME: 96-bit case was widened during legalize. We need to narrow it back
1428 // here but don't have an MMO.
1429
1430 unsigned LoadSize = Ty.getSizeInBits();
1431 int NumLoads = 1;
1432 if (LoadSize == 256 || LoadSize == 512) {
1433 NumLoads = LoadSize / 128;
1434 Ty = Ty.divide(NumLoads);
1435 }
1436
1437 // Use the alignment to ensure that the required offsets will fit into the
1438 // immediate offsets.
1439 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1440
1441 MachineIRBuilder B(MI);
1442 MachineFunction &MF = B.getMF();
1443
1444 Register SOffset;
1445 Register VOffset;
1446 int64_t ImmOffset = 0;
1447
1448 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1449 VOffset, SOffset, ImmOffset, Alignment);
1450
1451 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1452   // can, but we need to track an MMO for that.
1453 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1454 const Align MemAlign(4); // FIXME: ABI type alignment?
1455 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1456 MachinePointerInfo(),
1457 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1458 MachineMemOperand::MOInvariant,
1459 MemSize, MemAlign);
1460 if (MMOOffset != 0)
1461 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1462
1463 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1464 // assume that the buffer is unswizzled.
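  // Illustrative MIR sketch (virtual register names hypothetical):
  //   %dst:vgpr(s32) = G_AMDGPU_BUFFER_LOAD %rsrc, %zero_vindex, %voffset,
  //                                         %soffset, imm_offset, 0, 0
  // i.e. an untyped, unswizzled buffer access built from the offsets split
  // above.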
1465
1466 Register RSrc = MI.getOperand(1).getReg();
1467 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1468 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1469
1470 SmallVector<Register, 4> LoadParts(NumLoads);
1471
1472 MachineBasicBlock::iterator MII = MI.getIterator();
1473 MachineInstrSpan Span(MII, &B.getMBB());
1474
1475 for (int i = 0; i < NumLoads; ++i) {
1476 if (NumLoads == 1) {
1477 LoadParts[i] = Dst;
1478 } else {
1479 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1480 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1481 }
1482
1483 MachineMemOperand *MMO = BaseMMO;
1484 if (i != 0)
1485 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1486
1487 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1488 .addDef(LoadParts[i]) // vdata
1489 .addUse(RSrc) // rsrc
1490 .addUse(VIndex) // vindex
1491 .addUse(VOffset) // voffset
1492 .addUse(SOffset) // soffset
1493 .addImm(ImmOffset + 16 * i) // offset(imm)
1494 .addImm(0) // cachepolicy, swizzled buffer(imm)
1495 .addImm(0) // idxen(imm)
1496 .addMemOperand(MMO);
1497 }
1498
1499 // TODO: If only the resource is a VGPR, it may be better to execute the
1500 // scalar load in the waterfall loop if the resource is expected to frequently
1501 // be dynamically uniform.
1502 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1503 // Remove the original instruction to avoid potentially confusing the
1504 // waterfall loop logic.
1505 B.setInstr(*Span.begin());
1506 MI.eraseFromParent();
1507
1508 SmallSet<Register, 4> OpsToWaterfall;
1509
1510 OpsToWaterfall.insert(RSrc);
1511 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1512 OpsToWaterfall, MRI);
1513 }
1514
1515 if (NumLoads != 1) {
1516 if (Ty.isVector())
1517 B.buildConcatVectors(Dst, LoadParts);
1518 else
1519 B.buildMerge(Dst, LoadParts);
1520 }
1521
1522 // We removed the instruction earlier with a waterfall loop.
1523 if (RSrcBank == &AMDGPU::SGPRRegBank)
1524 MI.eraseFromParent();
1525
1526 return true;
1527 }
1528
1529 bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
1530 const OperandsMapper &OpdMapper, bool Signed) const {
1531 MachineInstr &MI = OpdMapper.getMI();
1532 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1533
1534 // Insert basic copies
1535 applyDefaultMapping(OpdMapper);
1536
1537 Register DstReg = MI.getOperand(0).getReg();
1538 LLT Ty = MRI.getType(DstReg);
1539
1540 const LLT S32 = LLT::scalar(32);
1541
1542 const RegisterBank *DstBank =
1543 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1544 if (DstBank == &AMDGPU::VGPRRegBank) {
1545 if (Ty == S32)
1546 return true;
1547
1548 // TODO: 64-bit version is scalar only, so we need to expand this.
1549 return false;
1550 }
1551
1552 Register SrcReg = MI.getOperand(2).getReg();
1553 Register OffsetReg = MI.getOperand(3).getReg();
1554 Register WidthReg = MI.getOperand(4).getReg();
1555
1556 // The scalar form packs the offset and width in a single operand.
1557
1558 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1559 GISelObserverWrapper Observer(&ApplyBank);
1560 MachineIRBuilder B(MI);
1561 B.setChangeObserver(Observer);
1562
1563 // Ensure the high bits are clear to insert the offset.
1564 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1565 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1566
1567 // Zeros out the low bits, so don't bother clamping the input value.
1568 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1569
1570 // Transformation function, pack the offset and width of a BFE into
1571 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1572 // source, bits [5:0] contain the offset and bits [22:16] the width.
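  // Worked example: offset = 8, width = 4 packs to (4 << 16) | 8 = 0x00040008.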
1573 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1574
1575 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1576 // register class constraints.
1577 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1578 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1579
1580 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1581 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1582 llvm_unreachable("failed to constrain BFE");
1583
1584 MI.eraseFromParent();
1585 return true;
1586 }
1587
1588 // FIXME: Duplicated from LegalizerHelper
1589 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
1590 switch (Opc) {
1591 case TargetOpcode::G_SMIN:
1592 return CmpInst::ICMP_SLT;
1593 case TargetOpcode::G_SMAX:
1594 return CmpInst::ICMP_SGT;
1595 case TargetOpcode::G_UMIN:
1596 return CmpInst::ICMP_ULT;
1597 case TargetOpcode::G_UMAX:
1598 return CmpInst::ICMP_UGT;
1599 default:
1600 llvm_unreachable("not in integer min/max");
1601 }
1602 }
1603
1604 static unsigned minMaxToExtend(unsigned Opc) {
1605 switch (Opc) {
1606 case TargetOpcode::G_SMIN:
1607 case TargetOpcode::G_SMAX:
1608 return TargetOpcode::G_SEXT;
1609 case TargetOpcode::G_UMIN:
1610 case TargetOpcode::G_UMAX:
1611 return TargetOpcode::G_ZEXT;
1612 default:
1613 llvm_unreachable("not in integer min/max");
1614 }
1615 }
1616
1617 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1618 // any illegal vector extend or unmerge operations.
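// Illustrative example: for a <2 x s16> source that bitcasts to the s32 value
// x = 0xFFFF0002 (lo = 2, hi = -1), the pairs produced below are:
//   G_SEXT   -> (sext_inreg(x, 16), ashr(x, 16)) = (2, -1)
//   G_ZEXT   -> (and(x, 0xffff),    lshr(x, 16)) = (2, 0xffff)
//   G_ANYEXT -> (x,                 lshr(x, 16)) = (0xffff0002, 0xffff)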
1619 static std::pair<Register, Register>
1620 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1621 const LLT S32 = LLT::scalar(32);
1622 auto Bitcast = B.buildBitcast(S32, Src);
1623
1624 if (ExtOpcode == TargetOpcode::G_SEXT) {
1625 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1626 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1627 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1628 }
1629
1630 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1631 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1632 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1633 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1634 }
1635
1636 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1637 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1638 }
1639
1640 static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
1641 CmpInst::Predicate Pred,
1642 Register Dst, Register Src0,
1643 Register Src1) {
1644 const LLT CmpType = LLT::scalar(32);
1645 auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
1646 return B.buildSelect(Dst, Cmp, Src0, Src1);
1647 }
1648
1649 // FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
1650 void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
1651 MachineInstr &MI) const {
1652 Register Dst = MI.getOperand(0).getReg();
1653 Register Src0 = MI.getOperand(1).getReg();
1654 Register Src1 = MI.getOperand(2).getReg();
1655
1656 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
1657 MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
1658
1659 Register CmpReg = Sel->getOperand(1).getReg();
1660 B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
1661 MI.eraseFromParent();
1662 }
1663
1664 // For cases where only a single copy is inserted for matching register banks,
1665 // replace the register in the instruction operand.
1666 static bool substituteSimpleCopyRegs(
1667 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1668 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1669 if (!SrcReg.empty()) {
1670 assert(SrcReg.size() == 1);
1671 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1672 return true;
1673 }
1674
1675 return false;
1676 }
1677
1678 /// Handle register layout difference for f16 images for some subtargets.
1679 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1680 MachineRegisterInfo &MRI,
1681 Register Reg) const {
1682 if (!Subtarget.hasUnpackedD16VMem())
1683 return Reg;
1684
1685 const LLT S16 = LLT::scalar(16);
1686 LLT StoreVT = MRI.getType(Reg);
1687 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1688 return Reg;
1689
1690 auto Unmerge = B.buildUnmerge(S16, Reg);
1691
1692
1693 SmallVector<Register, 4> WideRegs;
1694 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1695 WideRegs.push_back(Unmerge.getReg(I));
1696
1697 const LLT S32 = LLT::scalar(32);
1698 int NumElts = StoreVT.getNumElements();
1699
1700 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1701 }
1702
1703 static std::pair<Register, unsigned>
1704 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1705 int64_t Const;
1706 if (mi_match(Reg, MRI, m_ICst(Const)))
1707 return std::make_pair(Register(), Const);
1708
1709 Register Base;
1710 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1711 return std::make_pair(Base, Const);
1712
1713 // TODO: Handle G_OR used for add case
1714 return std::make_pair(Reg, 0);
1715 }
1716
1717 std::pair<Register, unsigned>
1718 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1719 Register OrigOffset) const {
1720 const unsigned MaxImm = 4095;
1721 Register BaseReg;
1722 unsigned ImmOffset;
1723 const LLT S32 = LLT::scalar(32);
1724
1725 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1726 OrigOffset);
1727
1728 unsigned C1 = 0;
1729 if (ImmOffset != 0) {
1730     // If the immediate value is too big for the immoffset field, keep only its
1731     // low 12 bits in the immoffset field so that the value that is copied/added
1732 // for the voffset field is a multiple of 4096, and it stands more chance
1733 // of being CSEd with the copy/add for another similar load/store.
1734 // However, do not do that rounding down to a multiple of 4096 if that is a
1735 // negative number, as it appears to be illegal to have a negative offset
1736 // in the vgpr, even if adding the immediate offset makes it positive.
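    // Worked example: an original constant offset of 8200 splits into
    // Overflow = 8200 & ~4095 = 8192 (added into the base/voffset) and
    // ImmOffset = 8 (returned as C1).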
1737 unsigned Overflow = ImmOffset & ~MaxImm;
1738 ImmOffset -= Overflow;
1739 if ((int32_t)Overflow < 0) {
1740 Overflow += ImmOffset;
1741 ImmOffset = 0;
1742 }
1743
1744 C1 = ImmOffset;
1745 if (Overflow != 0) {
1746 if (!BaseReg)
1747 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1748 else {
1749 auto OverflowVal = B.buildConstant(S32, Overflow);
1750 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1751 }
1752 }
1753 }
1754
1755 if (!BaseReg)
1756 BaseReg = B.buildConstant(S32, 0).getReg(0);
1757
1758 return {BaseReg, C1};
1759 }
1760
1761 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1762 int64_t C;
1763 return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1764 }
1765
1766 static unsigned extractGLC(unsigned CachePolicy) {
1767 return CachePolicy & 1;
1768 }
1769
1770 static unsigned extractSLC(unsigned CachePolicy) {
1771 return (CachePolicy >> 1) & 1;
1772 }
1773
1774 static unsigned extractDLC(unsigned CachePolicy) {
1775 return (CachePolicy >> 2) & 1;
1776 }
1777
1778 MachineInstr *
1779 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1780 MachineInstr &MI) const {
1781 MachineRegisterInfo &MRI = *B.getMRI();
1782 executeInWaterfallLoop(B, MI, MRI, {2, 4});
1783
1784 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1785
1786 Register VData = MI.getOperand(1).getReg();
1787 LLT Ty = MRI.getType(VData);
1788
1789 int EltSize = Ty.getScalarSizeInBits();
1790 int Size = Ty.getSizeInBits();
1791
1792 // FIXME: Broken integer truncstore.
1793 if (EltSize != 32)
1794 report_fatal_error("unhandled intrinsic store");
1795
1796 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1797 const int MemSize = (*MI.memoperands_begin())->getSize();
1798
1799
1800 Register RSrc = MI.getOperand(2).getReg();
1801 Register VOffset = MI.getOperand(3).getReg();
1802 Register SOffset = MI.getOperand(4).getReg();
1803 unsigned CachePolicy = MI.getOperand(5).getImm();
1804
1805 unsigned ImmOffset;
1806 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1807
1808 const bool Offen = !isZero(VOffset, MRI);
1809
1810 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1811 switch (8 * MemSize) {
1812 case 8:
1813 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1814 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1815 break;
1816 case 16:
1817 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1818 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1819 break;
1820 default:
1821 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1822 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1823 if (Size > 32)
1824 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1825 break;
1826 }
1827
1828
1829 // Set the insertion point back to the instruction in case it was moved into a
1830 // loop.
1831 B.setInstr(MI);
1832
1833 MachineInstrBuilder MIB = B.buildInstr(Opc)
1834 .addUse(VData);
1835
1836 if (Offen)
1837 MIB.addUse(VOffset);
1838
1839 MIB.addUse(RSrc)
1840 .addUse(SOffset)
1841 .addImm(ImmOffset)
1842 .addImm(extractGLC(CachePolicy))
1843 .addImm(extractSLC(CachePolicy))
1844 .addImm(0) // tfe: FIXME: Remove from inst
1845 .addImm(extractDLC(CachePolicy))
1846 .cloneMemRefs(MI);
1847
1848 // FIXME: We need a way to report failure from applyMappingImpl.
1849 // Insert constrain copies before inserting the loop.
1850 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1851 report_fatal_error("failed to constrain selected store intrinsic");
1852
1853 return MIB;
1854 }
1855
1856 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1857 Register SrcReg) const {
1858 MachineRegisterInfo &MRI = *B.getMRI();
1859 LLT SrcTy = MRI.getType(SrcReg);
1860 if (SrcTy.getSizeInBits() == 32) {
1861 // Use a v_mov_b32 here to make the exec dependency explicit.
1862 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1863 .addDef(DstReg)
1864 .addUse(SrcReg);
1865 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1866 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1867 }
1868
1869 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1870 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1871
1872 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1873 .addDef(TmpReg0)
1874 .addUse(SrcReg, 0, AMDGPU::sub0);
1875 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1876 .addDef(TmpReg1)
1877 .addUse(SrcReg, 0, AMDGPU::sub1);
1878 B.buildInstr(AMDGPU::REG_SEQUENCE)
1879 .addDef(DstReg)
1880 .addUse(TmpReg0)
1881 .addImm(AMDGPU::sub0)
1882 .addUse(TmpReg1)
1883 .addImm(AMDGPU::sub1);
1884
1885 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1886 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1887 }
1888
1889 /// Utility function for pushing dynamic vector indexes with a constant offset
1890 /// into waterfall loops.
1891 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1892 MachineInstr &IdxUseInstr,
1893 unsigned OpIdx,
1894 unsigned ConstOffset) {
1895 MachineRegisterInfo &MRI = *B.getMRI();
1896 const LLT S32 = LLT::scalar(32);
1897 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1898 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1899
1900 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1901
1902 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1903 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1904 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1905 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1906 }
1907
1908 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1909 /// original 32-bit source value (to be inserted in the low part of the combined
1910 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1911 /// value.
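// Illustrative example for the non-boolean G_SEXT case: with Lo32Reg holding
// 0x80000000, the high half becomes ashr(Lo32Reg, 31) = 0xFFFFFFFF; for G_ZEXT
// it is the constant 0, and for G_ANYEXT it is left undefined.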
1912 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1913 Register Hi32Reg, Register Lo32Reg,
1914 unsigned ExtOpc,
1915 const RegisterBank &RegBank,
1916 bool IsBooleanSrc = false) {
1917 if (ExtOpc == AMDGPU::G_ZEXT) {
1918 B.buildConstant(Hi32Reg, 0);
1919 } else if (ExtOpc == AMDGPU::G_SEXT) {
1920 if (IsBooleanSrc) {
1921 // If we know the original source was an s1, the high half is the same as
1922 // the low.
1923 B.buildCopy(Hi32Reg, Lo32Reg);
1924 } else {
1925 // Replicate sign bit from 32-bit extended part.
1926 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1927 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1928 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1929 }
1930 } else {
1931 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1932 B.buildUndef(Hi32Reg);
1933 }
1934 }
1935
1936 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1937 MachineInstr &MI, MachineRegisterInfo &MRI,
1938 const OperandsMapper &OpdMapper) const {
1939
1940 Register VecReg = MI.getOperand(1).getReg();
1941 Register Idx = MI.getOperand(2).getReg();
1942
1943 const RegisterBank &IdxBank =
1944 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1945
1946 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1947
1948 LLT VecTy = MRI.getType(VecReg);
1949 unsigned EltSize = VecTy.getScalarSizeInBits();
1950 unsigned NumElem = VecTy.getNumElements();
1951
1952 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1953 IsDivergentIdx))
1954 return false;
1955
1956 MachineIRBuilder B(MI);
1957 LLT S32 = LLT::scalar(32);
1958
1959 const RegisterBank &DstBank =
1960 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1961 const RegisterBank &SrcBank =
1962 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1963
1964 const RegisterBank &CCBank =
1965 (DstBank == AMDGPU::SGPRRegBank &&
1966 SrcBank == AMDGPU::SGPRRegBank &&
1967 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1968 : AMDGPU::VCCRegBank;
1969 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1970
1971 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1972 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1973 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1974 }
1975
1976 LLT EltTy = VecTy.getScalarType();
1977 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1978 unsigned NumLanes = DstRegs.size();
1979 if (!NumLanes)
1980 NumLanes = 1;
1981 else
1982 EltTy = MRI.getType(DstRegs[0]);
1983
1984 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1985 SmallVector<Register, 2> Res(NumLanes);
1986 for (unsigned L = 0; L < NumLanes; ++L)
1987 Res[L] = UnmergeToEltTy.getReg(L);
1988
1989 for (unsigned I = 1; I < NumElem; ++I) {
1990 auto IC = B.buildConstant(S32, I);
1991 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1992 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1993 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1994
1995 for (unsigned L = 0; L < NumLanes; ++L) {
1996 auto S = B.buildSelect(EltTy, Cmp,
1997 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1998
1999 for (unsigned N : { 0, 2, 3 })
2000 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2001
2002 Res[L] = S->getOperand(0).getReg();
2003 }
2004 }
2005
2006 for (unsigned L = 0; L < NumLanes; ++L) {
2007 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
2008 B.buildCopy(DstReg, Res[L]);
2009 MRI.setRegBank(DstReg, DstBank);
2010 }
2011
2012 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2013 MI.eraseFromParent();
2014
2015 return true;
2016 }
2017
2018 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2019 MachineInstr &MI, MachineRegisterInfo &MRI,
2020 const OperandsMapper &OpdMapper) const {
2021
2022 Register VecReg = MI.getOperand(1).getReg();
2023 Register Idx = MI.getOperand(3).getReg();
2024
2025 const RegisterBank &IdxBank =
2026 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2027
2028 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2029
2030 LLT VecTy = MRI.getType(VecReg);
2031 unsigned EltSize = VecTy.getScalarSizeInBits();
2032 unsigned NumElem = VecTy.getNumElements();
2033
2034 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2035 IsDivergentIdx))
2036 return false;
2037
2038 MachineIRBuilder B(MI);
2039 LLT S32 = LLT::scalar(32);
2040
2041 const RegisterBank &DstBank =
2042 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2043 const RegisterBank &SrcBank =
2044 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2045 const RegisterBank &InsBank =
2046 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2047
2048 const RegisterBank &CCBank =
2049 (DstBank == AMDGPU::SGPRRegBank &&
2050 SrcBank == AMDGPU::SGPRRegBank &&
2051 InsBank == AMDGPU::SGPRRegBank &&
2052 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2053 : AMDGPU::VCCRegBank;
2054 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2055
2056 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2057 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2058 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2059 }
2060
2061 LLT EltTy = VecTy.getScalarType();
2062 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2063 unsigned NumLanes = InsRegs.size();
2064 if (!NumLanes) {
2065 NumLanes = 1;
2066 InsRegs.push_back(MI.getOperand(2).getReg());
2067 } else {
2068 EltTy = MRI.getType(InsRegs[0]);
2069 }
2070
2071 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2072 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2073
2074 for (unsigned I = 0; I < NumElem; ++I) {
2075 auto IC = B.buildConstant(S32, I);
2076 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2077 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2078 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2079
2080 for (unsigned L = 0; L < NumLanes; ++L) {
2081 auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
2082 UnmergeToEltTy.getReg(I * NumLanes + L));
2083
2084 for (unsigned N : { 0, 2, 3 })
2085 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2086
2087 Ops[I * NumLanes + L] = S->getOperand(0).getReg();
2088 }
2089 }
2090
2091 LLT MergeTy = LLT::vector(Ops.size(), EltTy);
2092 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2093 B.buildBuildVector(MI.getOperand(0), Ops);
2094 } else {
2095 auto Vec = B.buildBuildVector(MergeTy, Ops);
2096 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2097 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2098 }
2099
2100 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2101 MI.eraseFromParent();
2102
2103 return true;
2104 }
2105
2106 void AMDGPURegisterBankInfo::applyMappingImpl(
2107 const OperandsMapper &OpdMapper) const {
2108 MachineInstr &MI = OpdMapper.getMI();
2109 unsigned Opc = MI.getOpcode();
2110 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2111 switch (Opc) {
2112 case AMDGPU::G_PHI: {
2113 Register DstReg = MI.getOperand(0).getReg();
2114 LLT DstTy = MRI.getType(DstReg);
2115 if (DstTy != LLT::scalar(1))
2116 break;
2117
2118 const LLT S32 = LLT::scalar(32);
2119 const RegisterBank *DstBank =
2120 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2121 if (DstBank == &AMDGPU::VCCRegBank) {
2122 applyDefaultMapping(OpdMapper);
2123 // The standard handling only considers the result register bank for
2124 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2125 // produce an invalid copy. We can only copy with some kind of compare to
2126     // get a vector boolean result. Insert a register bank copy that will be
2127 // correctly lowered to a compare.
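      // Illustrative sketch (virtual register names hypothetical): a non-VCC
      // incoming value %a from predecessor %bb gets
      //   %c:vcc(s1) = COPY %a
      // inserted before the terminator of %bb, and the phi operand is changed
      // to use %c.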
2128 MachineIRBuilder B(*MI.getParent()->getParent());
2129
2130 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2131 Register SrcReg = MI.getOperand(I).getReg();
2132 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2133
2134 if (SrcBank != &AMDGPU::VCCRegBank) {
2135 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2136 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2137
2138 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2139 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2140 MI.getOperand(I).setReg(Copy.getReg(0));
2141 }
2142 }
2143
2144 return;
2145 }
2146
2147 // Phi handling is strange and only considers the bank of the destination.
2148 substituteSimpleCopyRegs(OpdMapper, 0);
2149
2150 // Promote SGPR/VGPR booleans to s32
2151 MachineFunction *MF = MI.getParent()->getParent();
2152 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2153 GISelObserverWrapper Observer(&ApplyBank);
2154 MachineIRBuilder B(MI);
2155 LegalizerHelper Helper(*MF, Observer, B);
2156
2157 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2158 llvm_unreachable("widen scalar should have succeeded");
2159
2160 return;
2161 }
2162 case AMDGPU::G_ICMP:
2163 case AMDGPU::G_UADDO:
2164 case AMDGPU::G_USUBO:
2165 case AMDGPU::G_UADDE:
2166 case AMDGPU::G_SADDE:
2167 case AMDGPU::G_USUBE:
2168 case AMDGPU::G_SSUBE: {
2169 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2170 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2171
2172 const RegisterBank *DstBank =
2173 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2174 if (DstBank != &AMDGPU::SGPRRegBank)
2175 break;
2176
2177 const bool HasCarryIn = MI.getNumOperands() == 5;
2178
2179 // If this is a scalar compare, promote the result to s32, as the selection
2180 // will end up using a copy to a 32-bit vreg.
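    // Illustrative MIR sketch (names hypothetical):
    //   %d:sgpr(s1) = G_ICMP intpred(eq), %x, %y
    // becomes
    //   %n:sgpr(s32) = G_ICMP intpred(eq), %x, %y
    //   %d:sgpr(s1) = G_TRUNC %n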
2181 const LLT S32 = LLT::scalar(32);
2182 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2183 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2184 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2185 MachineIRBuilder B(MI);
2186
2187 if (HasCarryIn) {
2188 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2189 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2190 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2191 MI.getOperand(4).setReg(NewSrcReg);
2192 }
2193
2194 MachineBasicBlock *MBB = MI.getParent();
2195 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2196
2197 // If we had a constrained VCC result register, a copy was inserted to VCC
2198 // from SGPR.
2199 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2200 if (DefRegs.empty())
2201 DefRegs.push_back(DstReg);
2202 B.buildTrunc(DefRegs[0], NewDstReg);
2203 return;
2204 }
2205 case AMDGPU::G_SELECT: {
2206 Register DstReg = MI.getOperand(0).getReg();
2207 LLT DstTy = MRI.getType(DstReg);
2208
2209 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2210 if (CondRegs.empty())
2211 CondRegs.push_back(MI.getOperand(1).getReg());
2212 else {
2213 assert(CondRegs.size() == 1);
2214 }
2215
2216 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2217 if (CondBank == &AMDGPU::SGPRRegBank) {
2218 MachineIRBuilder B(MI);
2219 const LLT S32 = LLT::scalar(32);
2220 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2221 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2222
2223 MI.getOperand(1).setReg(NewCondReg);
2224 B.buildZExt(NewCondReg, CondRegs[0]);
2225 }
2226
2227 if (DstTy.getSizeInBits() != 64)
2228 break;
2229
2230 MachineIRBuilder B(MI);
2231 LLT HalfTy = getHalfSizedType(DstTy);
2232
2233 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2234 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2235 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2236
2237 // All inputs are SGPRs, nothing special to do.
2238 if (DefRegs.empty()) {
2239 assert(Src1Regs.empty() && Src2Regs.empty());
2240 break;
2241 }
2242
2243 if (Src1Regs.empty())
2244 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2245 else {
2246 setRegsToType(MRI, Src1Regs, HalfTy);
2247 }
2248
2249 if (Src2Regs.empty())
2250 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2251 else
2252 setRegsToType(MRI, Src2Regs, HalfTy);
2253
2254 setRegsToType(MRI, DefRegs, HalfTy);
2255
2256 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2257 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2258
2259 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2260 MI.eraseFromParent();
2261 return;
2262 }
2263 case AMDGPU::G_BRCOND: {
2264 Register CondReg = MI.getOperand(0).getReg();
2265 // FIXME: Should use legalizer helper, but should change bool ext type.
2266 const RegisterBank *CondBank =
2267 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2268
2269 if (CondBank == &AMDGPU::SGPRRegBank) {
2270 MachineIRBuilder B(MI);
2271 const LLT S32 = LLT::scalar(32);
2272 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2273 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2274
2275 MI.getOperand(0).setReg(NewCondReg);
2276 B.buildZExt(NewCondReg, CondReg);
2277 return;
2278 }
2279
2280 break;
2281 }
2282 case AMDGPU::G_AND:
2283 case AMDGPU::G_OR:
2284 case AMDGPU::G_XOR: {
2285     // 64-bit AND/OR/XOR are only available on the SALU, so split into
2286     // 2 32-bit ops if there is a VGPR input.
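    // Illustrative sketch: a 64-bit G_AND with a VGPR input becomes
    //   (lo0, hi0) = unmerge(src0); (lo1, hi1) = unmerge(src1)
    //   lo = G_AND lo0, lo1;  hi = G_AND hi0, hi1;  dst = merge(lo, hi)
    // with the pieces provided by the mapper or by split64BitValueForMapping
    // below.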
2287 Register DstReg = MI.getOperand(0).getReg();
2288 LLT DstTy = MRI.getType(DstReg);
2289
2290 if (DstTy.getSizeInBits() == 1) {
2291 const RegisterBank *DstBank =
2292 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2293 if (DstBank == &AMDGPU::VCCRegBank)
2294 break;
2295
2296 MachineFunction *MF = MI.getParent()->getParent();
2297 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2298 GISelObserverWrapper Observer(&ApplyBank);
2299 MachineIRBuilder B(MI);
2300 LegalizerHelper Helper(*MF, Observer, B);
2301
2302 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2303 LegalizerHelper::Legalized)
2304 llvm_unreachable("widen scalar should have succeeded");
2305 return;
2306 }
2307
2308 if (DstTy.getSizeInBits() != 64)
2309 break;
2310
2311 LLT HalfTy = getHalfSizedType(DstTy);
2312 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2313 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2314 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2315
2316 // All inputs are SGPRs, nothing special to do.
2317 if (DefRegs.empty()) {
2318 assert(Src0Regs.empty() && Src1Regs.empty());
2319 break;
2320 }
2321
2322 assert(DefRegs.size() == 2);
2323 assert(Src0Regs.size() == Src1Regs.size() &&
2324 (Src0Regs.empty() || Src0Regs.size() == 2));
2325
2326 // Depending on where the source registers came from, the generic code may
2327 // have decided to split the inputs already or not. If not, we still need to
2328 // extract the values.
2329 MachineIRBuilder B(MI);
2330
2331 if (Src0Regs.empty())
2332 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2333 else
2334 setRegsToType(MRI, Src0Regs, HalfTy);
2335
2336 if (Src1Regs.empty())
2337 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2338 else
2339 setRegsToType(MRI, Src1Regs, HalfTy);
2340
2341 setRegsToType(MRI, DefRegs, HalfTy);
2342
2343 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2344 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2345
2346 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2347 MI.eraseFromParent();
2348 return;
2349 }
2350 case AMDGPU::G_ADD:
2351 case AMDGPU::G_SUB:
2352 case AMDGPU::G_MUL:
2353 case AMDGPU::G_SHL:
2354 case AMDGPU::G_LSHR:
2355 case AMDGPU::G_ASHR: {
2356 Register DstReg = MI.getOperand(0).getReg();
2357 LLT DstTy = MRI.getType(DstReg);
2358
2359 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2360 // Packed 16-bit operations need to be scalarized and promoted.
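    // Illustrative sketch: an SGPR s16 G_ADD is widened to an s32 G_ADD, while
    // a v2s16 operation is unpacked into two s32 operations whose results are
    // repacked with G_BUILD_VECTOR_TRUNC below.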
2361 if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
2362 break;
2363
2364 const RegisterBank *DstBank =
2365 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2366 if (DstBank == &AMDGPU::VGPRRegBank)
2367 break;
2368
2369 const LLT S32 = LLT::scalar(32);
2370 MachineBasicBlock *MBB = MI.getParent();
2371 MachineFunction *MF = MBB->getParent();
2372 MachineIRBuilder B(MI);
2373 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2374 GISelObserverWrapper Observer(&ApplySALU);
2375
2376 if (DstTy.isVector()) {
2377 B.setChangeObserver(Observer);
2378
2379 Register WideSrc0Lo, WideSrc0Hi;
2380 Register WideSrc1Lo, WideSrc1Hi;
2381
2382 std::tie(WideSrc0Lo, WideSrc0Hi)
2383 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
2384 std::tie(WideSrc1Lo, WideSrc1Hi)
2385 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
2386 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2387 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2388 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2389 MI.eraseFromParent();
2390 } else {
2391 LegalizerHelper Helper(*MF, Observer, B);
2392
2393 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2394 llvm_unreachable("widen scalar should have succeeded");
2395
2396 // FIXME: s16 shift amounts should be legal.
2397 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2398 Opc == AMDGPU::G_ASHR) {
2399 B.setInsertPt(*MBB, MI.getIterator());
2400 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2401 llvm_unreachable("widen scalar should have succeeded");
2402 }
2403 }
2404
2405 return;
2406 }
2407 case AMDGPU::G_SMIN:
2408 case AMDGPU::G_SMAX:
2409 case AMDGPU::G_UMIN:
2410 case AMDGPU::G_UMAX: {
2411 Register DstReg = MI.getOperand(0).getReg();
2412 const RegisterBank *DstBank =
2413 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2414 if (DstBank == &AMDGPU::VGPRRegBank)
2415 break;
2416
2417 MachineFunction *MF = MI.getParent()->getParent();
2418 MachineIRBuilder B(MI);
2419
2420 // Turn scalar min/max into a compare and select.
2421 LLT Ty = MRI.getType(DstReg);
2422 const LLT S32 = LLT::scalar(32);
2423 const LLT S16 = LLT::scalar(16);
2424 const LLT V2S16 = LLT::vector(2, 16);
2425
2426 if (Ty == V2S16) {
2427 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2428 GISelObserverWrapper Observer(&ApplySALU);
2429 B.setChangeObserver(Observer);
2430
2431 // Need to widen to s32, and expand as cmp + select, and avoid producing
2432 // illegal vector extends or unmerges that would need further
2433 // legalization.
2434 //
2435 // TODO: Should we just readfirstlane? That should probably be handled
2436 // with a UniformVGPR register bank that wouldn't need special
2437 // consideration here.
2438
2439 Register Dst = MI.getOperand(0).getReg();
2440 Register Src0 = MI.getOperand(1).getReg();
2441 Register Src1 = MI.getOperand(2).getReg();
2442
2443 Register WideSrc0Lo, WideSrc0Hi;
2444 Register WideSrc1Lo, WideSrc1Hi;
2445
2446 unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
2447
2448 std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
2449 std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
2450
2451 Register Lo = MRI.createGenericVirtualRegister(S32);
2452 Register Hi = MRI.createGenericVirtualRegister(S32);
2453 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
2454 buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
2455 buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
2456
2457 B.buildBuildVectorTrunc(Dst, {Lo, Hi});
2458 MI.eraseFromParent();
2459 } else if (Ty == S16) {
2460 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2461 GISelObserverWrapper Observer(&ApplySALU);
2462 LegalizerHelper Helper(*MF, Observer, B);
2463
2464 // Need to widen to s32, and expand as cmp + select.
2465 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2466 llvm_unreachable("widenScalar should have succeeded");
2467
2468 // FIXME: This is relying on widenScalar leaving MI in place.
2469 lowerScalarMinMax(B, MI);
2470 } else
2471 lowerScalarMinMax(B, MI);
2472
2473 return;
2474 }
2475 case AMDGPU::G_SEXT_INREG: {
2476 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2477 if (SrcRegs.empty())
2478 break; // Nothing to repair
2479
2480 const LLT S32 = LLT::scalar(32);
2481 MachineIRBuilder B(MI);
2482 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2483 GISelObserverWrapper Observer(&O);
2484 B.setChangeObserver(Observer);
2485
2486 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2487 // we would need to further expand, and doesn't let us directly set the
2488 // result registers.
2489 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2490
2491 int Amt = MI.getOperand(2).getImm();
2492 if (Amt <= 32) {
2493 if (Amt == 32) {
2494 // The low bits are unchanged.
2495 B.buildCopy(DstRegs[0], SrcRegs[0]);
2496 } else {
2497 // Extend in the low bits and propagate the sign bit to the high half.
2498 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2499 }
2500
2501 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2502 } else {
2503 // The low bits are unchanged, and extend in the high bits.
2504 B.buildCopy(DstRegs[0], SrcRegs[0]);
2505 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2506 }
2507
2508 Register DstReg = MI.getOperand(0).getReg();
2509 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2510 MI.eraseFromParent();
2511 return;
2512 }
2513 case AMDGPU::G_CTPOP:
2514 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2515 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2516 MachineIRBuilder B(MI);
2517 MachineFunction &MF = B.getMF();
2518
2519 const RegisterBank *DstBank =
2520 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2521 if (DstBank == &AMDGPU::SGPRRegBank)
2522 break;
2523
2524 Register SrcReg = MI.getOperand(1).getReg();
2525 const LLT S32 = LLT::scalar(32);
2526 LLT Ty = MRI.getType(SrcReg);
2527 if (Ty == S32)
2528 break;
2529
2530 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2531 GISelObserverWrapper Observer(&ApplyVALU);
2532 LegalizerHelper Helper(MF, Observer, B);
2533
2534 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2535 llvm_unreachable("narrowScalar should have succeeded");
2536 return;
2537 }
2538 case AMDGPU::G_SEXT:
2539 case AMDGPU::G_ZEXT:
2540 case AMDGPU::G_ANYEXT: {
2541 Register SrcReg = MI.getOperand(1).getReg();
2542 LLT SrcTy = MRI.getType(SrcReg);
2543 const bool Signed = Opc == AMDGPU::G_SEXT;
2544
2545 assert(empty(OpdMapper.getVRegs(1)));
2546
2547 MachineIRBuilder B(MI);
2548 const RegisterBank *SrcBank =
2549 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2550
2551 Register DstReg = MI.getOperand(0).getReg();
2552 LLT DstTy = MRI.getType(DstReg);
2553 if (DstTy.isScalar() &&
2554 SrcBank != &AMDGPU::SGPRRegBank &&
2555 SrcBank != &AMDGPU::VCCRegBank &&
2556         // FIXME: Should handle any type that rounds to s64 when irregular
2557         // breakdowns are supported.
2558 DstTy.getSizeInBits() == 64 &&
2559 SrcTy.getSizeInBits() <= 32) {
2560 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2561
2562 // Extend to 32-bit, and then extend the low half.
2563 if (Signed) {
2564 // TODO: Should really be buildSExtOrCopy
2565 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2566 } else if (Opc == AMDGPU::G_ZEXT) {
2567 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2568 } else {
2569 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2570 }
2571
2572 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2573 MRI.setRegBank(DstReg, *SrcBank);
2574 MI.eraseFromParent();
2575 return;
2576 }
2577
2578 if (SrcTy != LLT::scalar(1))
2579 return;
2580
2581     // It is not legal to have a legalization artifact with a VCC source. Rather
2582     // than introducing a copy, insert the select that such a copy would have
2583     // been selected to.
2584 if (SrcBank == &AMDGPU::VCCRegBank) {
2585 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2586
2587 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2588
2589 unsigned DstSize = DstTy.getSizeInBits();
2590 // 64-bit select is SGPR only
2591 const bool UseSel64 = DstSize > 32 &&
2592 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2593
2594 // TODO: Should s16 select be legal?
2595 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2596 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2597 auto False = B.buildConstant(SelType, 0);
2598
2599 MRI.setRegBank(True.getReg(0), *DstBank);
2600 MRI.setRegBank(False.getReg(0), *DstBank);
2601 MRI.setRegBank(DstReg, *DstBank);
2602
2603 if (DstSize > 32) {
2604 B.buildSelect(DefRegs[0], SrcReg, True, False);
2605 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2606 } else if (DstSize < 32) {
2607 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2608 MRI.setRegBank(Sel.getReg(0), *DstBank);
2609 B.buildTrunc(DstReg, Sel);
2610 } else {
2611 B.buildSelect(DstReg, SrcReg, True, False);
2612 }
2613
2614 MI.eraseFromParent();
2615 return;
2616 }
2617
2618 break;
2619 }
2620 case AMDGPU::G_BUILD_VECTOR:
2621 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2622 Register DstReg = MI.getOperand(0).getReg();
2623 LLT DstTy = MRI.getType(DstReg);
2624 if (DstTy != LLT::vector(2, 16))
2625 break;
2626
2627 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2628 substituteSimpleCopyRegs(OpdMapper, 1);
2629 substituteSimpleCopyRegs(OpdMapper, 2);
2630
2631 const RegisterBank *DstBank =
2632 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2633 if (DstBank == &AMDGPU::SGPRRegBank)
2634 break; // Can use S_PACK_* instructions.
2635
2636 MachineIRBuilder B(MI);
2637
2638 Register Lo = MI.getOperand(1).getReg();
2639 Register Hi = MI.getOperand(2).getReg();
2640 const LLT S32 = LLT::scalar(32);
2641
2642 const RegisterBank *BankLo =
2643 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2644 const RegisterBank *BankHi =
2645 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2646
2647 Register ZextLo;
2648 Register ShiftHi;
2649
2650 if (Opc == AMDGPU::G_BUILD_VECTOR) {
2651 ZextLo = B.buildZExt(S32, Lo).getReg(0);
2652 MRI.setRegBank(ZextLo, *BankLo);
2653
2654 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2655 MRI.setRegBank(ZextHi, *BankHi);
2656
2657 auto ShiftAmt = B.buildConstant(S32, 16);
2658 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2659
2660 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2661 MRI.setRegBank(ShiftHi, *BankHi);
2662 } else {
2663 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2664 MRI.setRegBank(MaskLo, *BankLo);
2665
2666 auto ShiftAmt = B.buildConstant(S32, 16);
2667 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2668
2669 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2670 MRI.setRegBank(ShiftHi, *BankHi);
2671
2672 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2673 MRI.setRegBank(ZextLo, *BankLo);
2674 }
2675
2676 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2677 MRI.setRegBank(Or.getReg(0), *DstBank);
2678
2679 B.buildBitcast(DstReg, Or);
2680 MI.eraseFromParent();
2681 return;
2682 }
2683 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2684 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2685
2686 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2687
2688 Register DstReg = MI.getOperand(0).getReg();
2689 Register SrcReg = MI.getOperand(1).getReg();
2690
2691 const LLT S32 = LLT::scalar(32);
2692 LLT DstTy = MRI.getType(DstReg);
2693 LLT SrcTy = MRI.getType(SrcReg);
2694
2695 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2696 return;
2697
2698 MachineIRBuilder B(MI);
2699
2700 const ValueMapping &DstMapping
2701 = OpdMapper.getInstrMapping().getOperandMapping(0);
2702 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2703 const RegisterBank *SrcBank =
2704 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2705 const RegisterBank *IdxBank =
2706 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2707
2708 Register BaseIdxReg;
2709 unsigned ConstOffset;
2710 std::tie(BaseIdxReg, ConstOffset) =
2711 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2712
2713 // See if the index is an add of a constant which will be foldable by moving
2714 // the base register of the index later if this is going to be executed in a
2715 // waterfall loop. This is essentially to reassociate the add of a constant
2716 // with the readfirstlane.
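    // Illustrative example: for an index of the form (%base + 4), the
    // waterfall loop runs on %base alone and the +4 is re-added inside the
    // loop (see reinsertVectorIndexAdd below).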
2717 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2718 ConstOffset > 0 &&
2719 ConstOffset < SrcTy.getNumElements();
2720
2721 // Move the base register. We'll re-insert the add later.
2722 if (ShouldMoveIndexIntoLoop)
2723 MI.getOperand(2).setReg(BaseIdxReg);
2724
2725 // If this is a VGPR result only because the index was a VGPR result, the
2726 // actual indexing will be done on the SGPR source vector, which will
2727 // produce a scalar result. We need to copy to the VGPR result inside the
2728 // waterfall loop.
2729 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2730 SrcBank == &AMDGPU::SGPRRegBank;
2731 if (DstRegs.empty()) {
2732 applyDefaultMapping(OpdMapper);
2733
2734 executeInWaterfallLoop(MI, MRI, { 2 });
2735
2736 if (NeedCopyToVGPR) {
2737 // We don't want a phi for this temporary reg.
2738 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2739 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2740 MI.getOperand(0).setReg(TmpReg);
2741 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2742
2743 // Use a v_mov_b32 here to make the exec dependency explicit.
2744 buildVCopy(B, DstReg, TmpReg);
2745 }
2746
2747 // Re-insert the constant offset add inside the waterfall loop.
2748 if (ShouldMoveIndexIntoLoop)
2749 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2750
2751 return;
2752 }
2753
2754 assert(DstTy.getSizeInBits() == 64);
2755
2756 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
2757
2758 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2759 auto One = B.buildConstant(S32, 1);
2760
2761 MachineBasicBlock::iterator MII = MI.getIterator();
2762
2763 // Split the vector index into 32-bit pieces. Prepare to move all of the
2764 // new instructions into a waterfall loop if necessary.
2765 //
2766 // Don't put the bitcast or constant in the loop.
2767 MachineInstrSpan Span(MII, &B.getMBB());
2768
2769 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
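    // Example: extracting s64 element 3 reads s32 elements 6 and 7 of the
    // bitcast vector.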
2770 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2771 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2772
2773 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2774 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2775
2776 MRI.setRegBank(DstReg, *DstBank);
2777 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2778 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2779 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2780 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2781
2782 SmallSet<Register, 4> OpsToWaterfall;
2783 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2784 MI.eraseFromParent();
2785 return;
2786 }
2787
2788 // Remove the original instruction to avoid potentially confusing the
2789 // waterfall loop logic.
2790 B.setInstr(*Span.begin());
2791 MI.eraseFromParent();
2792 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2793 OpsToWaterfall, MRI);
2794
2795 if (NeedCopyToVGPR) {
2796 MachineBasicBlock *LoopBB = Extract1->getParent();
2797 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2798 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2799 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2800 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2801
2802 Extract0->getOperand(0).setReg(TmpReg0);
2803 Extract1->getOperand(0).setReg(TmpReg1);
2804
2805 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2806
2807 buildVCopy(B, DstRegs[0], TmpReg0);
2808 buildVCopy(B, DstRegs[1], TmpReg1);
2809 }
2810
2811 if (ShouldMoveIndexIntoLoop)
2812 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2813
2814 return;
2815 }
2816 case AMDGPU::G_INSERT_VECTOR_ELT: {
2817 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2818
2819 Register DstReg = MI.getOperand(0).getReg();
2820 LLT VecTy = MRI.getType(DstReg);
2821
2822 assert(OpdMapper.getVRegs(0).empty());
2823 assert(OpdMapper.getVRegs(3).empty());
2824
2825 if (substituteSimpleCopyRegs(OpdMapper, 1))
2826 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2827
2828 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2829 return;
2830
2831 const RegisterBank *IdxBank =
2832 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2833
2834 Register SrcReg = MI.getOperand(1).getReg();
2835 Register InsReg = MI.getOperand(2).getReg();
2836 LLT InsTy = MRI.getType(InsReg);
2837 (void)InsTy;
2838
2839 Register BaseIdxReg;
2840 unsigned ConstOffset;
2841 std::tie(BaseIdxReg, ConstOffset) =
2842 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2843
2844 // See if the index is an add of a constant which will be foldable by moving
2845 // the base register of the index later if this is going to be executed in a
2846 // waterfall loop. This is essentially to reassociate the add of a constant
2847 // with the readfirstlane.
2848 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2849 ConstOffset > 0 &&
2850 ConstOffset < VecTy.getNumElements();
2851
2852 // Move the base register. We'll re-insert the add later.
2853 if (ShouldMoveIndexIntoLoop)
2854 MI.getOperand(3).setReg(BaseIdxReg);
2855
2856
2857 if (InsRegs.empty()) {
2858 executeInWaterfallLoop(MI, MRI, { 3 });
2859
2860 // Re-insert the constant offset add inside the waterfall loop.
2861 if (ShouldMoveIndexIntoLoop) {
2862 MachineIRBuilder B(MI);
2863 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2864 }
2865
2866 return;
2867 }
2868
2869
2870 assert(InsTy.getSizeInBits() == 64);
2871
2872 const LLT S32 = LLT::scalar(32);
2873 LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
2874
2875 MachineIRBuilder B(MI);
2876 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2877 auto One = B.buildConstant(S32, 1);
2878
2879 // Split the vector index into 32-bit pieces. Prepare to move all of the
2880 // new instructions into a waterfall loop if necessary.
2881 //
2882 // Don't put the bitcast or constant in the loop.
2883 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2884
2885 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2886 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2887 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2888
2889 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2890 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2891
2892 const RegisterBank *DstBank =
2893 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2894 const RegisterBank *SrcBank =
2895 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2896 const RegisterBank *InsSrcBank =
2897 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2898
2899 MRI.setRegBank(InsReg, *InsSrcBank);
2900 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2901 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2902 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2903 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2904 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2905 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2906
2907
2908 SmallSet<Register, 4> OpsToWaterfall;
2909 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2910 B.setInsertPt(B.getMBB(), MI);
2911 B.buildBitcast(DstReg, InsHi);
2912 MI.eraseFromParent();
2913 return;
2914 }
2915
2916 B.setInstr(*Span.begin());
2917 MI.eraseFromParent();
2918
2919 // Figure out the point after the waterfall loop before mangling the control
2920 // flow.
2921 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2922 OpsToWaterfall, MRI);
2923
2924 // The insertion point is now right after the original instruction.
2925 //
2926 // Keep the bitcast to the original vector type out of the loop. Doing this
2927     // saves an extra phi we don't need inside the loop.
2928 B.buildBitcast(DstReg, InsHi);
2929
2930 // Re-insert the constant offset add inside the waterfall loop.
2931 if (ShouldMoveIndexIntoLoop)
2932 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2933
2934 return;
2935 }
2936 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2937 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2938 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2939 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2940 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2941 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2942 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2943 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2944 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2945 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2946 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2947 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2948 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2949 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2950 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2951 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2952 applyDefaultMapping(OpdMapper);
2953 executeInWaterfallLoop(MI, MRI, {1, 4});
2954 return;
2955 }
2956 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2957 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2958 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2959 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2960 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2961 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2962 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2963 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2964 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2965 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2966 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2967 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2968 applyDefaultMapping(OpdMapper);
2969 executeInWaterfallLoop(MI, MRI, {2, 5});
2970 return;
2971 }
2972 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
2973 applyDefaultMapping(OpdMapper);
2974 executeInWaterfallLoop(MI, MRI, {2, 5});
2975 return;
2976 }
2977 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2978 applyDefaultMapping(OpdMapper);
2979 executeInWaterfallLoop(MI, MRI, {3, 6});
2980 return;
2981 }
2982 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2983 applyMappingSBufferLoad(OpdMapper);
2984 return;
2985 }
2986 case AMDGPU::G_INTRINSIC: {
2987 switch (MI.getIntrinsicID()) {
2988 case Intrinsic::amdgcn_readlane: {
2989 substituteSimpleCopyRegs(OpdMapper, 2);
2990
2991 assert(OpdMapper.getVRegs(0).empty());
2992 assert(OpdMapper.getVRegs(3).empty());
2993
2994 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2995 // waterfall loop, so assume it's a uniform value.
2996 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2997 return;
2998 }
2999 case Intrinsic::amdgcn_writelane: {
3000 assert(OpdMapper.getVRegs(0).empty());
3001 assert(OpdMapper.getVRegs(2).empty());
3002 assert(OpdMapper.getVRegs(3).empty());
3003
3004 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3005 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
3006 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
3007 return;
3008 }
3009 case Intrinsic::amdgcn_interp_p1:
3010 case Intrinsic::amdgcn_interp_p2:
3011 case Intrinsic::amdgcn_interp_mov:
3012 case Intrinsic::amdgcn_interp_p1_f16:
3013 case Intrinsic::amdgcn_interp_p2_f16: {
3014 applyDefaultMapping(OpdMapper);
3015
3016 // Readlane for m0 value, which is always the last operand.
3017 // FIXME: Should this be a waterfall loop instead?
3018 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3019 return;
3020 }
3021 case Intrinsic::amdgcn_permlane16:
3022 case Intrinsic::amdgcn_permlanex16: {
3023 // Doing a waterfall loop over these wouldn't make any sense.
3024 substituteSimpleCopyRegs(OpdMapper, 2);
3025 substituteSimpleCopyRegs(OpdMapper, 3);
3026 constrainOpWithReadfirstlane(MI, MRI, 4);
3027 constrainOpWithReadfirstlane(MI, MRI, 5);
3028 return;
3029 }
3030 case Intrinsic::amdgcn_sbfe:
3031 applyMappingBFEIntrinsic(OpdMapper, true);
3032 return;
3033 case Intrinsic::amdgcn_ubfe:
3034 applyMappingBFEIntrinsic(OpdMapper, false);
3035 return;
3036 case Intrinsic::amdgcn_ballot:
3037 // Use default handling and insert copy to vcc source.
3038 break;
3039 }
3040 break;
3041 }
3042 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3043 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3044 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3045 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3046 assert(RSrcIntrin && RSrcIntrin->IsImage);
3047 // Non-images can have complications from operands that allow both SGPR
3048 // and VGPR. For now it's too complicated to figure out the final opcode
3049 // to derive the register bank from the MCInstrDesc.
3050 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3051 return;
3052 }
3053 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3054 unsigned N = MI.getNumExplicitOperands() - 2;
3055 executeInWaterfallLoop(MI, MRI, { N });
3056 return;
3057 }
3058 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3059 auto IntrID = MI.getIntrinsicID();
3060 switch (IntrID) {
3061 case Intrinsic::amdgcn_ds_ordered_add:
3062 case Intrinsic::amdgcn_ds_ordered_swap: {
3063 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3064 assert(OpdMapper.getVRegs(0).empty());
3065 substituteSimpleCopyRegs(OpdMapper, 3);
3066 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3067 return;
3068 }
3069 case Intrinsic::amdgcn_ds_gws_init:
3070 case Intrinsic::amdgcn_ds_gws_barrier:
3071 case Intrinsic::amdgcn_ds_gws_sema_br: {
3072       // Only the first lane is executed, so readfirstlane is safe.
3073 substituteSimpleCopyRegs(OpdMapper, 1);
3074 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3075 return;
3076 }
3077 case Intrinsic::amdgcn_ds_gws_sema_v:
3078 case Intrinsic::amdgcn_ds_gws_sema_p:
3079 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3080       // Only the first lane is executed, so readfirstlane is safe.
3081 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3082 return;
3083 }
3084 case Intrinsic::amdgcn_ds_append:
3085 case Intrinsic::amdgcn_ds_consume: {
3086 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3087 return;
3088 }
3089 case Intrinsic::amdgcn_s_sendmsg:
3090 case Intrinsic::amdgcn_s_sendmsghalt: {
3091 // FIXME: Should this use a waterfall loop?
3092 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3093 return;
3094 }
3095 case Intrinsic::amdgcn_s_setreg: {
3096 constrainOpWithReadfirstlane(MI, MRI, 2);
3097 return;
3098 }
3099 default: {
3100 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3101 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3102 // Non-images can have complications from operands that allow both SGPR
3103 // and VGPR. For now it's too complicated to figure out the final opcode
3104 // to derive the register bank from the MCInstrDesc.
3105 if (RSrcIntrin->IsImage) {
3106 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3107 return;
3108 }
3109 }
3110
3111 break;
3112 }
3113 }
3114 break;
3115 }
3116 case AMDGPU::G_LOAD:
3117 case AMDGPU::G_ZEXTLOAD:
3118 case AMDGPU::G_SEXTLOAD: {
3119 if (applyMappingLoad(MI, OpdMapper, MRI))
3120 return;
3121 break;
3122 }
3123 case AMDGPU::G_DYN_STACKALLOC:
3124 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3125 return;
3126 default:
3127 break;
3128 }
3129
3130 return applyDefaultMapping(OpdMapper);
3131 }
3132
3133 // vgpr, sgpr -> vgpr
3134 // vgpr, agpr -> vgpr
3135 // agpr, agpr -> agpr
3136 // agpr, sgpr -> vgpr
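// e.g. regBankUnion(AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID) yields
// AMDGPU::VGPRRegBankID, and an invalid (unassigned) bank on either side
// simply yields the other operand's bank.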
3137 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3138 if (RB0 == AMDGPU::InvalidRegBankID)
3139 return RB1;
3140 if (RB1 == AMDGPU::InvalidRegBankID)
3141 return RB0;
3142
3143 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3144 return AMDGPU::SGPRRegBankID;
3145
3146 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3147 return AMDGPU::AGPRRegBankID;
3148
3149 return AMDGPU::VGPRRegBankID;
3150 }
3151
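// Combine the banks of two boolean (s1) values: once either side is a lane
// mask (vcc), the result must be vcc; otherwise defer to regBankUnion.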
3152 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3153 if (RB0 == AMDGPU::InvalidRegBankID)
3154 return RB1;
3155 if (RB1 == AMDGPU::InvalidRegBankID)
3156 return RB0;
3157
3158 // vcc, vcc -> vcc
3159 // vcc, sgpr -> vcc
3160 // vcc, vgpr -> vcc
3161 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3162 return AMDGPU::VCCRegBankID;
3163
3164   // Neither operand is vcc at this point, so fall back to the plain union.
3165 return regBankUnion(RB0, RB1);
3166 }
3167
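/// Fold the banks of all register operands of \p MI together with
/// regBankUnion, stopping early once the result is known to be VGPR.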
3168 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3169 const MachineInstr &MI) const {
3170 unsigned RegBank = AMDGPU::InvalidRegBankID;
3171
3172 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3173 if (!MI.getOperand(i).isReg())
3174 continue;
3175 Register Reg = MI.getOperand(i).getReg();
3176 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3177 RegBank = regBankUnion(RegBank, Bank->getID());
3178 if (RegBank == AMDGPU::VGPRRegBankID)
3179 break;
3180 }
3181 }
3182
3183 return RegBank;
3184 }
3185
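/// Return true if every register operand of \p MI that already has a bank
/// assigned is in the SGPR bank, i.e. the whole operation can stay on the
/// SALU.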
3186 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3187 const MachineFunction &MF = *MI.getParent()->getParent();
3188 const MachineRegisterInfo &MRI = MF.getRegInfo();
3189   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3190 if (!MI.getOperand(i).isReg())
3191 continue;
3192 Register Reg = MI.getOperand(i).getReg();
3193 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3194 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3195 return false;
3196 }
3197 }
3198 return true;
3199 }
3200
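/// Produce a mapping that places every register operand of \p MI in the SGPR
/// bank at its type's size.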
3201 const RegisterBankInfo::InstructionMapping &
3202 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3203 const MachineFunction &MF = *MI.getParent()->getParent();
3204 const MachineRegisterInfo &MRI = MF.getRegInfo();
3205 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3206
3207 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3208 const MachineOperand &SrcOp = MI.getOperand(i);
3209 if (!SrcOp.isReg())
3210 continue;
3211
3212 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3213 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3214 }
3215 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3216 MI.getNumOperands());
3217 }
3218
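/// Produce a VALU mapping: s1 operands are mapped to the VCC bank and all
/// other register operands are forced to VGPRs.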
3219 const RegisterBankInfo::InstructionMapping &
3220 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3221 const MachineFunction &MF = *MI.getParent()->getParent();
3222 const MachineRegisterInfo &MRI = MF.getRegInfo();
3223 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3224
3225 // Even though we technically could use SGPRs, this would require knowledge of
3226 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3227 //
3228 // TODO: Unary ops are trivially OK, so accept SGPRs?
3229 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3230 const MachineOperand &Src = MI.getOperand(i);
3231 if (!Src.isReg())
3232 continue;
3233
3234 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3235 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3236 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3237 }
3238
3239 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3240 MI.getNumOperands());
3241 }
3242
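/// Produce a mapping that places every register operand of \p MI in the VGPR
/// bank, regardless of its current assignment.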
3243 const RegisterBankInfo::InstructionMapping &
3244 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3245 const MachineFunction &MF = *MI.getParent()->getParent();
3246 const MachineRegisterInfo &MRI = MF.getRegInfo();
3247 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3248
3249 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3250 const MachineOperand &Op = MI.getOperand(I);
3251 if (!Op.isReg())
3252 continue;
3253
3254 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3255 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3256 }
3257
3258 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3259 MI.getNumOperands());
3260 }
3261
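/// Compute the mapping for an image intrinsic: the rsrc operand (and the
/// sampler immediately following it, if present) keeps whatever bank it
/// currently has so applyMapping can legalize it later, while the remaining
/// register operands are mapped to VGPRs.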
3262 const RegisterBankInfo::InstructionMapping &
3263 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3264 const MachineInstr &MI,
3265 int RsrcIdx) const {
3266 // The reported argument index is relative to the IR intrinsic call arguments,
3267 // so we need to shift by the number of defs and the intrinsic ID.
3268 RsrcIdx += MI.getNumExplicitDefs() + 1;
3269
3270 const int NumOps = MI.getNumOperands();
3271 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3272
3273 // TODO: Should packed/unpacked D16 difference be reported here as part of
3274 // the value mapping?
3275 for (int I = 0; I != NumOps; ++I) {
3276 if (!MI.getOperand(I).isReg())
3277 continue;
3278
3279 Register OpReg = MI.getOperand(I).getReg();
3280 // We replace some dead address operands with $noreg
3281 if (!OpReg)
3282 continue;
3283
3284 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3285
3286 // FIXME: Probably need a new intrinsic register bank searchable table to
3287 // handle arbitrary intrinsics easily.
3288 //
3289 // If this has a sampler, it immediately follows rsrc.
3290 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3291
3292 if (MustBeSGPR) {
3293       // This must be an SGPR, so we must report whatever it is as legal.
3294 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3295 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3296 } else {
3297 // Some operands must be VGPR, and these are easy to copy to.
3298 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3299 }
3300 }
3301
3302 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3303 }
3304
3305 /// Return the mapping for a pointer argument.
3306 const RegisterBankInfo::ValueMapping *
3307 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3308 Register PtrReg) const {
3309 LLT PtrTy = MRI.getType(PtrReg);
3310 unsigned Size = PtrTy.getSizeInBits();
3311 if (Subtarget.useFlatForGlobal() ||
3312 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3313 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3314
3315 // If we're using MUBUF instructions for global memory, an SGPR base register
3316 // is possible. Otherwise this needs to be a VGPR.
3317 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3318 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3319 }
3320
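/// Pick between a scalar (SMRD) mapping and a VGPR (MUBUF/FLAT) mapping for a
/// generic load, based on the pointer's bank, the address space, and whether
/// a scalar load is legal for this memory access.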
3321 const RegisterBankInfo::InstructionMapping &
3322 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3323
3324 const MachineFunction &MF = *MI.getParent()->getParent();
3325 const MachineRegisterInfo &MRI = MF.getRegInfo();
3326 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3327 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3328 Register PtrReg = MI.getOperand(1).getReg();
3329 LLT PtrTy = MRI.getType(PtrReg);
3330 unsigned AS = PtrTy.getAddressSpace();
3331 unsigned PtrSize = PtrTy.getSizeInBits();
3332
3333 const ValueMapping *ValMapping;
3334 const ValueMapping *PtrMapping;
3335
3336 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3337
3338 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3339 if (isScalarLoadLegal(MI)) {
3340 // We have a uniform instruction so we want to use an SMRD load
3341 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3342 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3343 } else {
3344 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3345
3346 // If we're using MUBUF instructions for global memory, an SGPR base
3347 // register is possible. Otherwise this needs to be a VGPR.
3348 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3349 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3350
3351 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3352 }
3353 } else {
3354 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3355 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3356 }
3357
3358 OpdsMapping[0] = ValMapping;
3359 OpdsMapping[1] = PtrMapping;
3360 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3361 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3362 return Mapping;
3363
3364 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3365 // handle that during instruction selection?
3366 }
3367
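/// Return the ID of the bank currently assigned to \p Reg, or \p Default if
/// no bank has been assigned yet.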
3368 unsigned
3369 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3370 const MachineRegisterInfo &MRI,
3371 unsigned Default) const {
3372 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3373 return Bank ? Bank->getID() : Default;
3374 }
3375
3376 const RegisterBankInfo::ValueMapping *
3377 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3378 const MachineRegisterInfo &MRI,
3379 const TargetRegisterInfo &TRI) const {
3380   // Lie and claim anything is legal, even though this needs to be an SGPR.
3381 // applyMapping will have to deal with it as a waterfall loop.
3382 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3383 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3384 return AMDGPU::getValueMapping(Bank, Size);
3385 }
3386
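/// Map \p Reg to the VGPR bank at its type's size, regardless of its current
/// assignment.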
3387 const RegisterBankInfo::ValueMapping *
3388 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3389 const MachineRegisterInfo &MRI,
3390 const TargetRegisterInfo &TRI) const {
3391 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3392 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3393 }
3394
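/// Map \p Reg to the AGPR bank at its type's size.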
3395 const RegisterBankInfo::ValueMapping *
3396 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3397 const MachineRegisterInfo &MRI,
3398 const TargetRegisterInfo &TRI) const {
3399 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3400 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3401 }
3402
3403 ///
3404 /// This function must return a legal mapping, because
3405 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3406 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a VGPR-to-SGPR
3407 /// copy to be generated is illegal.
3408 ///
3409 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3410 // legal. These will be dealt with in applyMappingImpl.
3411 //
3412 const RegisterBankInfo::InstructionMapping &
3413 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3414 const MachineFunction &MF = *MI.getParent()->getParent();
3415 const MachineRegisterInfo &MRI = MF.getRegInfo();
3416
3417 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3418 // The default logic bothers to analyze impossible alternative mappings. We
3419 // want the most straightforward mapping, so just directly handle this.
3420 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3421 *TRI);
3422 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3423 *TRI);
3424 assert(SrcBank && "src bank should have been assigned already");
3425 if (!DstBank)
3426 DstBank = SrcBank;
3427
3428 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3429 if (cannotCopy(*DstBank, *SrcBank, Size))
3430 return getInvalidInstructionMapping();
3431
3432 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3433 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3434 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3435 OpdsMapping[0] = &ValMap;
3436 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3437 OpdsMapping[1] = &ValMap;
3438
3439 return getInstructionMapping(
3440 1, /*Cost*/ 1,
3441 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3442 }
3443
3444 if (MI.isRegSequence()) {
3445 // If any input is a VGPR, the result must be a VGPR. The default handling
3446 // assumes any copy between banks is legal.
3447 unsigned BankID = AMDGPU::SGPRRegBankID;
3448
3449 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3450 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3451 // It doesn't make sense to use vcc or scc banks here, so just ignore
3452 // them.
3453 if (OpBank != AMDGPU::SGPRRegBankID) {
3454 BankID = AMDGPU::VGPRRegBankID;
3455 break;
3456 }
3457 }
3458 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3459
3460 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3461 return getInstructionMapping(
3462 1, /*Cost*/ 1,
3463 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3464 }
3465
3466 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3467 // properly.
3468 //
3469 // TODO: There are additional exec masking dependencies to analyze.
3470 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3471 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3472 Register DstReg = MI.getOperand(0).getReg();
3473
3474 // Sometimes the result may have already been assigned a bank.
3475 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3476 ResultBank = DstBank->getID();
3477
3478 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3479 Register Reg = MI.getOperand(I).getReg();
3480 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3481
3482 // FIXME: Assuming VGPR for any undetermined inputs.
3483 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3484 ResultBank = AMDGPU::VGPRRegBankID;
3485 break;
3486 }
3487
3488 // FIXME: Need to promote SGPR case to s32
3489 unsigned OpBank = Bank->getID();
3490 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3491 }
3492
3493 assert(ResultBank != AMDGPU::InvalidRegBankID);
3494
3495 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3496
3497 const ValueMapping &ValMap =
3498 getValueMapping(0, Size, getRegBank(ResultBank));
3499 return getInstructionMapping(
3500 1, /*Cost*/ 1,
3501 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3502 }
3503
3504 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3505 if (Mapping.isValid())
3506 return Mapping;
3507
3508 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3509
3510 switch (MI.getOpcode()) {
3511 default:
3512 return getInvalidInstructionMapping();
3513
3514 case AMDGPU::G_AND:
3515 case AMDGPU::G_OR:
3516 case AMDGPU::G_XOR: {
3517 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3518 if (Size == 1) {
3519 const RegisterBank *DstBank
3520 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3521
3522 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3523 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3524 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3525 if (DstBank) {
3526 TargetBankID = DstBank->getID();
3527 if (DstBank == &AMDGPU::VCCRegBank) {
3528 TargetBankID = AMDGPU::VCCRegBankID;
3529 BankLHS = AMDGPU::VCCRegBankID;
3530 BankRHS = AMDGPU::VCCRegBankID;
3531 } else {
3532 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3533 AMDGPU::SGPRRegBankID);
3534 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3535 AMDGPU::SGPRRegBankID);
3536 }
3537 } else {
3538 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3539 AMDGPU::VCCRegBankID);
3540 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3541 AMDGPU::VCCRegBankID);
3542
3543 // Both inputs should be true booleans to produce a boolean result.
3544 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3545 TargetBankID = AMDGPU::VGPRRegBankID;
3546 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3547 TargetBankID = AMDGPU::VCCRegBankID;
3548 BankLHS = AMDGPU::VCCRegBankID;
3549 BankRHS = AMDGPU::VCCRegBankID;
3550 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3551 TargetBankID = AMDGPU::SGPRRegBankID;
3552 }
3553 }
3554
3555 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3556 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3557 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3558 break;
3559 }
3560
3561 if (Size == 64) {
3562
3563 if (isSALUMapping(MI)) {
3564 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3565 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3566 } else {
3567 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3568 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3569 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3570
3571 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3572 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3573 }
3574
3575 break;
3576 }
3577
3578 LLVM_FALLTHROUGH;
3579 }
3580 case AMDGPU::G_PTR_ADD:
3581 case AMDGPU::G_PTRMASK:
3582 case AMDGPU::G_ADD:
3583 case AMDGPU::G_SUB:
3584 case AMDGPU::G_MUL:
3585 case AMDGPU::G_SHL:
3586 case AMDGPU::G_LSHR:
3587 case AMDGPU::G_ASHR:
3588 case AMDGPU::G_UADDO:
3589 case AMDGPU::G_USUBO:
3590 case AMDGPU::G_UADDE:
3591 case AMDGPU::G_SADDE:
3592 case AMDGPU::G_USUBE:
3593 case AMDGPU::G_SSUBE:
3594 case AMDGPU::G_SMIN:
3595 case AMDGPU::G_SMAX:
3596 case AMDGPU::G_UMIN:
3597 case AMDGPU::G_UMAX:
3598 case AMDGPU::G_SHUFFLE_VECTOR:
3599 if (isSALUMapping(MI))
3600 return getDefaultMappingSOP(MI);
3601 LLVM_FALLTHROUGH;
3602
3603 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3604 case AMDGPU::G_SSUBSAT:
3605 case AMDGPU::G_UADDSAT:
3606 case AMDGPU::G_USUBSAT:
3607 case AMDGPU::G_FADD:
3608 case AMDGPU::G_FSUB:
3609 case AMDGPU::G_FPTOSI:
3610 case AMDGPU::G_FPTOUI:
3611 case AMDGPU::G_FMUL:
3612 case AMDGPU::G_FMA:
3613 case AMDGPU::G_FMAD:
3614 case AMDGPU::G_FSQRT:
3615 case AMDGPU::G_FFLOOR:
3616 case AMDGPU::G_FCEIL:
3617 case AMDGPU::G_FRINT:
3618 case AMDGPU::G_SITOFP:
3619 case AMDGPU::G_UITOFP:
3620 case AMDGPU::G_FPTRUNC:
3621 case AMDGPU::G_FPEXT:
3622 case AMDGPU::G_FEXP2:
3623 case AMDGPU::G_FLOG2:
3624 case AMDGPU::G_FMINNUM:
3625 case AMDGPU::G_FMAXNUM:
3626 case AMDGPU::G_FMINNUM_IEEE:
3627 case AMDGPU::G_FMAXNUM_IEEE:
3628 case AMDGPU::G_FCANONICALIZE:
3629 case AMDGPU::G_INTRINSIC_TRUNC:
3630 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3631 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3632 case AMDGPU::G_AMDGPU_FFBH_U32:
3633 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3634 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3635 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3636 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3637 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3638 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3639 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3640 return getDefaultMappingVOP(MI);
3641 case AMDGPU::G_UMULH:
3642 case AMDGPU::G_SMULH: {
3643 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3644 return getDefaultMappingSOP(MI);
3645 return getDefaultMappingVOP(MI);
3646 }
3647 case AMDGPU::G_IMPLICIT_DEF: {
3648 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3649 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3650 break;
3651 }
3652 case AMDGPU::G_FCONSTANT:
3653 case AMDGPU::G_CONSTANT:
3654 case AMDGPU::G_GLOBAL_VALUE:
3655 case AMDGPU::G_BLOCK_ADDR:
3656 case AMDGPU::G_READCYCLECOUNTER: {
3657 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3658 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3659 break;
3660 }
3661 case AMDGPU::G_FRAME_INDEX: {
3662 // TODO: This should be the same as other constants, but eliminateFrameIndex
3663 // currently assumes VALU uses.
3664 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3665 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3666 break;
3667 }
3668 case AMDGPU::G_DYN_STACKALLOC: {
3669 // Result is always uniform, and a wave reduction is needed for the source.
3670 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3671 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3672 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3673 break;
3674 }
3675 case AMDGPU::G_INSERT: {
3676 unsigned BankID = getMappingType(MRI, MI);
3677 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3678 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3679 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3680 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3681 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3682 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3683 OpdsMapping[3] = nullptr;
3684 break;
3685 }
3686 case AMDGPU::G_EXTRACT: {
3687 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3688 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3689 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3690 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3691 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3692 OpdsMapping[2] = nullptr;
3693 break;
3694 }
3695 case AMDGPU::G_BUILD_VECTOR:
3696 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3697 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3698 if (DstTy == LLT::vector(2, 16)) {
3699 unsigned DstSize = DstTy.getSizeInBits();
3700 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3701 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3702 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3703 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3704
3705 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3706 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3707 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3708 break;
3709 }
3710
3711 LLVM_FALLTHROUGH;
3712 }
3713 case AMDGPU::G_MERGE_VALUES:
3714 case AMDGPU::G_CONCAT_VECTORS: {
3715 unsigned Bank = getMappingType(MRI, MI);
3716 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3717 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3718
3719 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3720 // Op1 and Dst should use the same register bank.
3721 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3722 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3723 break;
3724 }
3725 case AMDGPU::G_BITCAST:
3726 case AMDGPU::G_INTTOPTR:
3727 case AMDGPU::G_PTRTOINT:
3728 case AMDGPU::G_BITREVERSE:
3729 case AMDGPU::G_FABS:
3730 case AMDGPU::G_FNEG: {
3731 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3732 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3733 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3734 break;
3735 }
3736 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3737 case AMDGPU::G_CTTZ_ZERO_UNDEF:
3738 case AMDGPU::G_CTPOP: {
3739 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3740 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3741 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3742
3743 // This should really be getValueMappingSGPR64Only, but allowing the generic
3744 // code to handle the register split just makes using LegalizerHelper more
3745 // difficult.
3746 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3747 break;
3748 }
3749 case AMDGPU::G_TRUNC: {
3750 Register Dst = MI.getOperand(0).getReg();
3751 Register Src = MI.getOperand(1).getReg();
3752 unsigned Bank = getRegBankID(Src, MRI);
3753 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3754 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3755 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3756 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3757 break;
3758 }
3759 case AMDGPU::G_ZEXT:
3760 case AMDGPU::G_SEXT:
3761 case AMDGPU::G_ANYEXT:
3762 case AMDGPU::G_SEXT_INREG: {
3763 Register Dst = MI.getOperand(0).getReg();
3764 Register Src = MI.getOperand(1).getReg();
3765 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3766 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3767
3768 unsigned DstBank;
3769 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3770 assert(SrcBank);
3771 switch (SrcBank->getID()) {
3772 case AMDGPU::SGPRRegBankID:
3773 DstBank = AMDGPU::SGPRRegBankID;
3774 break;
3775 default:
3776 DstBank = AMDGPU::VGPRRegBankID;
3777 break;
3778 }
3779
3780 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3781 // 32-bits, and then to 64.
3782 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3783 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3784 SrcSize);
3785 break;
3786 }
3787 case AMDGPU::G_FCMP: {
3788 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3789 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3790 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3791 OpdsMapping[1] = nullptr; // Predicate Operand.
3792 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3793 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3794 break;
3795 }
3796 case AMDGPU::G_STORE: {
3797 assert(MI.getOperand(0).isReg());
3798 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3799
3800 // FIXME: We need to specify a different reg bank once scalar stores are
3801 // supported.
3802 const ValueMapping *ValMapping =
3803 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3804 OpdsMapping[0] = ValMapping;
3805 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3806 break;
3807 }
3808 case AMDGPU::G_ICMP: {
3809 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3810 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3811
3812 // See if the result register has already been constrained to vcc, which may
3813 // happen due to control flow intrinsic lowering.
3814 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3815 AMDGPU::SGPRRegBankID);
3816 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3817 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3818
3819 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3820 Op2Bank == AMDGPU::SGPRRegBankID &&
3821 Op3Bank == AMDGPU::SGPRRegBankID &&
3822 (Size == 32 || (Size == 64 &&
3823 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3824 Subtarget.hasScalarCompareEq64()));
3825
3826 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3827 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3828
3829 // TODO: Use 32-bit for scalar output size.
3830 // SCC results will need to be copied to a 32-bit SGPR virtual register.
3831 const unsigned ResultSize = 1;
3832
3833 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3834 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3835 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3836 break;
3837 }
3838 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3839 // VGPR index can be used for waterfall when indexing a SGPR vector.
3840 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3841 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3842 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3843 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3844 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3845 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3846
3847 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3848 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3849
3850     // The index can be in either bank if the source vector is VGPR.
3851 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3852 break;
3853 }
3854 case AMDGPU::G_INSERT_VECTOR_ELT: {
3855 unsigned OutputBankID = isSALUMapping(MI) ?
3856 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3857
3858 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3859 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3860 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3861 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3862 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3863
3864 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3865 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3866
3867 // This is a weird case, because we need to break down the mapping based on
3868 // the register bank of a different operand.
3869 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3870 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3871 InsertSize);
3872 } else {
3873 assert(InsertSize == 32 || InsertSize == 64);
3874 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3875 }
3876
3877     // The index can be in either bank if the source vector is VGPR.
3878 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3879 break;
3880 }
3881 case AMDGPU::G_UNMERGE_VALUES: {
3882 unsigned Bank = getMappingType(MRI, MI);
3883
3884 // Op1 and Dst should use the same register bank.
3885 // FIXME: Shouldn't this be the default? Why do we need to handle this?
3886 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3887 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3888 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3889 }
3890 break;
3891 }
3892 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3893 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3894 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3895 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3896 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3897 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3898 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3899 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3900 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3901 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3902 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3903 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3904 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3905 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3906 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3907 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3908 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3909
3910 // rsrc
3911 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3912
3913 // vindex
3914 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3915
3916 // voffset
3917 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3918
3919 // soffset
3920 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3921
3922 // Any remaining operands are immediates and were correctly null
3923 // initialized.
3924 break;
3925 }
3926 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3927 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3928 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3929 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3930 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3931 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3932 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3933 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3934 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3935 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3936 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3937 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3938 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
3939 // vdata_out
3940 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3941
3942 // vdata_in
3943 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3944
3945 // rsrc
3946 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3947
3948 // vindex
3949 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3950
3951 // voffset
3952 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3953
3954 // soffset
3955 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3956
3957 // Any remaining operands are immediates and were correctly null
3958 // initialized.
3959 break;
3960 }
3961 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3962 // vdata_out
3963 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3964
3965 // vdata_in
3966 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3967
3968 // cmp
3969 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3970
3971 // rsrc
3972 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3973
3974 // vindex
3975 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3976
3977 // voffset
3978 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3979
3980 // soffset
3981 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3982
3983 // Any remaining operands are immediates and were correctly null
3984 // initialized.
3985 break;
3986 }
3987 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3988 // Lie and claim everything is legal, even though some need to be
3989 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3990 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3991 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3992
3993     // We need to convert this to a MUBUF if either the resource or offset is
3994 // VGPR.
3995 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3996 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3997 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
3998
3999 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4000 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4001 break;
4002 }
4003 case AMDGPU::G_INTRINSIC: {
4004 switch (MI.getIntrinsicID()) {
4005 default:
4006 return getInvalidInstructionMapping();
4007 case Intrinsic::amdgcn_div_fmas:
4008 case Intrinsic::amdgcn_div_fixup:
4009 case Intrinsic::amdgcn_trig_preop:
4010 case Intrinsic::amdgcn_sin:
4011 case Intrinsic::amdgcn_cos:
4012 case Intrinsic::amdgcn_log_clamp:
4013 case Intrinsic::amdgcn_rcp:
4014 case Intrinsic::amdgcn_rcp_legacy:
4015 case Intrinsic::amdgcn_sqrt:
4016 case Intrinsic::amdgcn_rsq:
4017 case Intrinsic::amdgcn_rsq_legacy:
4018 case Intrinsic::amdgcn_rsq_clamp:
4019 case Intrinsic::amdgcn_fmul_legacy:
4020 case Intrinsic::amdgcn_fma_legacy:
4021 case Intrinsic::amdgcn_ldexp:
4022 case Intrinsic::amdgcn_frexp_mant:
4023 case Intrinsic::amdgcn_frexp_exp:
4024 case Intrinsic::amdgcn_fract:
4025 case Intrinsic::amdgcn_cvt_pkrtz:
4026 case Intrinsic::amdgcn_cvt_pknorm_i16:
4027 case Intrinsic::amdgcn_cvt_pknorm_u16:
4028 case Intrinsic::amdgcn_cvt_pk_i16:
4029 case Intrinsic::amdgcn_cvt_pk_u16:
4030 case Intrinsic::amdgcn_fmed3:
4031 case Intrinsic::amdgcn_cubeid:
4032 case Intrinsic::amdgcn_cubema:
4033 case Intrinsic::amdgcn_cubesc:
4034 case Intrinsic::amdgcn_cubetc:
4035 case Intrinsic::amdgcn_sffbh:
4036 case Intrinsic::amdgcn_fmad_ftz:
4037 case Intrinsic::amdgcn_mbcnt_lo:
4038 case Intrinsic::amdgcn_mbcnt_hi:
4039 case Intrinsic::amdgcn_mul_u24:
4040 case Intrinsic::amdgcn_mul_i24:
4041 case Intrinsic::amdgcn_lerp:
4042 case Intrinsic::amdgcn_sad_u8:
4043 case Intrinsic::amdgcn_msad_u8:
4044 case Intrinsic::amdgcn_sad_hi_u8:
4045 case Intrinsic::amdgcn_sad_u16:
4046 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4047 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4048 case Intrinsic::amdgcn_mqsad_u32_u8:
4049 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4050 case Intrinsic::amdgcn_alignbit:
4051 case Intrinsic::amdgcn_alignbyte:
4052 case Intrinsic::amdgcn_fdot2:
4053 case Intrinsic::amdgcn_sdot2:
4054 case Intrinsic::amdgcn_udot2:
4055 case Intrinsic::amdgcn_sdot4:
4056 case Intrinsic::amdgcn_udot4:
4057 case Intrinsic::amdgcn_sdot8:
4058 case Intrinsic::amdgcn_udot8:
4059 return getDefaultMappingVOP(MI);
4060 case Intrinsic::amdgcn_sbfe:
4061 case Intrinsic::amdgcn_ubfe:
4062 if (isSALUMapping(MI))
4063 return getDefaultMappingSOP(MI);
4064 return getDefaultMappingVOP(MI);
4065 case Intrinsic::amdgcn_ds_swizzle:
4066 case Intrinsic::amdgcn_ds_permute:
4067 case Intrinsic::amdgcn_ds_bpermute:
4068 case Intrinsic::amdgcn_update_dpp:
4069 case Intrinsic::amdgcn_mov_dpp8:
4070 case Intrinsic::amdgcn_mov_dpp:
4071 case Intrinsic::amdgcn_wwm:
4072 case Intrinsic::amdgcn_wqm:
4073 case Intrinsic::amdgcn_softwqm:
4074 case Intrinsic::amdgcn_set_inactive:
4075 return getDefaultMappingAllVGPR(MI);
4076 case Intrinsic::amdgcn_kernarg_segment_ptr:
4077 case Intrinsic::amdgcn_s_getpc:
4078 case Intrinsic::amdgcn_groupstaticsize:
4079 case Intrinsic::amdgcn_reloc_constant:
4080 case Intrinsic::returnaddress: {
4081 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4082 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4083 break;
4084 }
4085 case Intrinsic::amdgcn_wqm_vote: {
4086 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4087 OpdsMapping[0] = OpdsMapping[2]
4088 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4089 break;
4090 }
4091 case Intrinsic::amdgcn_ps_live: {
4092 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4093 break;
4094 }
4095 case Intrinsic::amdgcn_div_scale: {
4096 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4097 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4098 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4099 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4100
4101 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4102 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4103 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4104 break;
4105 }
4106 case Intrinsic::amdgcn_class: {
4107 Register Src0Reg = MI.getOperand(2).getReg();
4108 Register Src1Reg = MI.getOperand(3).getReg();
4109 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4110 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4111 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4112 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4113 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4114 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4115 break;
4116 }
4117 case Intrinsic::amdgcn_icmp:
4118 case Intrinsic::amdgcn_fcmp: {
4119 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4120 // This is not VCCRegBank because this is not used in boolean contexts.
4121 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4122 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4123 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4124 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4125 break;
4126 }
4127 case Intrinsic::amdgcn_readlane: {
4128 // This must be an SGPR, but accept a VGPR.
4129 Register IdxReg = MI.getOperand(3).getReg();
4130 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4131 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4132 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4133 LLVM_FALLTHROUGH;
4134 }
4135 case Intrinsic::amdgcn_readfirstlane: {
4136 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4137 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4138 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4139 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4140 break;
4141 }
4142 case Intrinsic::amdgcn_writelane: {
4143 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4144 Register SrcReg = MI.getOperand(2).getReg();
4145 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4146 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4147 Register IdxReg = MI.getOperand(3).getReg();
4148 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4149 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4150 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4151
4152 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4153 // to legalize.
4154 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4155 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4156 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4157 break;
4158 }
4159 case Intrinsic::amdgcn_if_break: {
4160 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4161 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4162 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4163 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4164 break;
4165 }
4166 case Intrinsic::amdgcn_permlane16:
4167 case Intrinsic::amdgcn_permlanex16: {
4168 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4169 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4170 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4171 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4172 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4173 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4174 break;
4175 }
4176 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4177 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4178 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4179 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4180 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4181 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4182 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4183 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4184 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4185 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4186 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4187 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4188 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4189 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4190 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4191 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4192 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4193 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4194 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4195 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
4196 // Default for MAI intrinsics.
4197 // srcC can also be an immediate which can be folded later.
4198 // FIXME: Should we eventually add an alternative mapping with AGPR src
4199 // for srcA/srcB?
4200 //
4201 // vdst, srcA, srcB, srcC
4202 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4203 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4204 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4205 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4206 break;
4207 }
4208 case Intrinsic::amdgcn_interp_p1:
4209 case Intrinsic::amdgcn_interp_p2:
4210 case Intrinsic::amdgcn_interp_mov:
4211 case Intrinsic::amdgcn_interp_p1_f16:
4212 case Intrinsic::amdgcn_interp_p2_f16: {
4213 const int M0Idx = MI.getNumOperands() - 1;
4214 Register M0Reg = MI.getOperand(M0Idx).getReg();
4215 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4216 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4217
4218 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4219 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4220 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4221
4222 // Must be SGPR, but we must take whatever the original bank is and fix it
4223 // later.
4224 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4225 break;
4226 }
4227 case Intrinsic::amdgcn_ballot: {
4228 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4229 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4230 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4231 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4232 break;
4233 }
4234 }
4235 break;
4236 }
4237 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4238 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
4239 auto IntrID = MI.getIntrinsicID();
4240 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4241 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4242 // Non-images can have complications from operands that allow both SGPR
4243 // and VGPR. For now it's too complicated to figure out the final opcode
4244 // to derive the register bank from the MCInstrDesc.
4245 assert(RSrcIntrin->IsImage);
4246 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4247 }
4248 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4249 unsigned N = MI.getNumExplicitOperands() - 2;
4250 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4251 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4252 for (unsigned I = 2; I < N; ++I)
4253 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4254 break;
4255 }
4256 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4257 auto IntrID = MI.getIntrinsicID();
4258 switch (IntrID) {
4259 case Intrinsic::amdgcn_s_getreg:
4260 case Intrinsic::amdgcn_s_memtime:
4261 case Intrinsic::amdgcn_s_memrealtime:
4262 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4263 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4264 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4265 break;
4266 }
4267 case Intrinsic::amdgcn_global_atomic_fadd:
4268 case Intrinsic::amdgcn_global_atomic_csub:
4269 return getDefaultMappingAllVGPR(MI);
4270 case Intrinsic::amdgcn_ds_ordered_add:
4271 case Intrinsic::amdgcn_ds_ordered_swap: {
4272 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4273 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4274 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4275 AMDGPU::SGPRRegBankID);
4276 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4277 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4278 break;
4279 }
4280 case Intrinsic::amdgcn_ds_append:
4281 case Intrinsic::amdgcn_ds_consume: {
4282 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4283 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4284 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4285 break;
4286 }
4287 case Intrinsic::amdgcn_exp_compr:
4288 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4289 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4290 break;
4291 case Intrinsic::amdgcn_exp:
4292 // FIXME: Could we support packed types here?
4293 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4294 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4295 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4296 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4297 break;
4298 case Intrinsic::amdgcn_s_sendmsg:
4299 case Intrinsic::amdgcn_s_sendmsghalt: {
4300 // This must be an SGPR, but accept a VGPR.
4301 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4302 AMDGPU::SGPRRegBankID);
4303 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4304 break;
4305 }
4306 case Intrinsic::amdgcn_s_setreg: {
4307 // This must be an SGPR, but accept a VGPR.
4308 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4309 AMDGPU::SGPRRegBankID);
4310 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4311 break;
4312 }
4313 case Intrinsic::amdgcn_end_cf: {
4314 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4315 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4316 break;
4317 }
4318 case Intrinsic::amdgcn_else: {
4319 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4320 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4321 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4322 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4323 break;
4324 }
4325 case Intrinsic::amdgcn_kill: {
4326 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4327 break;
4328 }
4329 case Intrinsic::amdgcn_raw_buffer_load:
4330 case Intrinsic::amdgcn_raw_tbuffer_load: {
4331 // FIXME: Should make intrinsic ID the last operand of the instruction,
4332       // then this would be the same as store.
4333 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4334 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4335 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4336 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4337 break;
4338 }
4339 case Intrinsic::amdgcn_raw_buffer_store:
4340 case Intrinsic::amdgcn_raw_buffer_store_format:
4341 case Intrinsic::amdgcn_raw_tbuffer_store: {
4342 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4343 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4344 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4345 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4346 break;
4347 }
4348 case Intrinsic::amdgcn_struct_buffer_load:
4349 case Intrinsic::amdgcn_struct_tbuffer_load: {
4350 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4351 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4352 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4353 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4354 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4355 break;
4356 }
4357 case Intrinsic::amdgcn_struct_buffer_store:
4358 case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
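      // The value operand is per-lane (VGPR); the offset operand ends up in
      // M0 and therefore wants a uniform value.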
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;
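
    // Only keep the select on the SALU when the condition and both value
    // inputs are scalar; any divergent input forces a VCC condition and VGPR
    // values.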
    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID ||
           CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
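    // There is no single 64-bit VALU select, so report the VGPR halves as
    // 2 x 32-bit (getValueMappingSGPR64Only) to allow splitting; a fully
    // scalar select keeps one 64-bit operand.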
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }
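
  // Loads get their own mapping logic; getInstrMappingForLoad decides between
  // a scalar (SMEM) and a vector mapping based on the pointer bank and the
  // memory operand.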
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
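    // Atomic results and data operands are always per-lane (VGPR); the
    // pointer mapping comes from getValueMappingForPtr, which may keep an
    // SGPR base where the addressing mode allows one.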
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
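    // A condition already known to be in an SGPR stays scalar (it will become
    // an SCC branch); anything else is treated as a lane mask in VCC.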
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
