//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. The later SIShrinkInstructions pass will replace
/// NSA forms with sequential versions where possible.
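///
/// For example (illustrative values only), if the register allocator assigned
/// the three address operands of an NSA IMAGE_SAMPLE to v7, v3 and v12, this
/// pass tries to move them into a contiguous range such as v3, v4 and v5 so
/// that the shorter sequential encoding can be used.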
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");

namespace {

class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  typedef enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers
  } NSA_Status;

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)


char GCNNSAReassign::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;

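// Try to assign Intervals to the contiguous physical range starting at
// StartReg. All intervals are unassigned first so their current locations do
// not interfere with the new range. On failure the intervals are left
// unassigned and the caller is responsible for restoring the original
// allocation.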
bool
GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                                   unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], StartReg + N))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], StartReg + N);

  return true;
}

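// Return true if the range [StartReg, StartReg + NumRegs) consists of
// allocatable registers and does not touch a callee-saved register which is
// not already used, as that would create a new spill/restore obligation.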
bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}

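// Linearly scan the VGPR file and try to move all intervals into the first
// contiguous window that fits. MaxNumVGPRs bounds the scan so the function's
// register budget (and thus occupancy) is not exceeded.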
bool
GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}

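// Classify MI: NOT_NSA if it is not a GFX10 NSA MIMG instruction, FIXED if
// its address operands cannot be legally reassigned (physical or unassigned
// registers, non-VGPR_32 operands, copy-hinted registers, implicit uses),
// NON_CONTIGUOUS if the assigned VGPRs are scattered, and CONTIGUOUS
// otherwise. With Fast set, only the physical assignment is inspected.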
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return NSA_Status::NOT_NSA;

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // Bail if address is not a VGPR32. It should be possible to extend the
      // optimization to work with subregs of wider register tuples, but the
      // logic to find free registers would be much more complicated, with
      // much lower chances of success. It seems reasonable to assume that in
      // most cases a tuple is used because a vector variable contains
      // different parts of an address, which is then either already
      // consecutive or cannot be reassigned if it is not. If needed, it is
      // better to rely on the register coalescer to process such address
      // tuples.
      if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}

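// Main entry point: collect all NSA candidates in a first pass, then try to
// reassign the scattered ones to contiguous VGPR ranges, rolling back any
// change that fails or that would break an already-contiguous candidate.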
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

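  // First pass: record every NSA instruction along with a flag telling
  // whether its address registers are already contiguous.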
  using Candidate = std::pair<const MachineInstr *, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

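  // Second pass: attempt to rewrite each non-contiguous candidate.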
  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happens to be contiguous.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<unsigned, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::find(Intervals, LI) != Intervals.end()) {
        // Same register used twice; unable to make the address sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
               dbgs() << '\n');

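    // Look for a contiguous block of VGPRs that can hold the whole address.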
    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
           LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

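    // Either no contiguous block was found or the move would pessimize
    // another candidate that was already contiguous: restore the original
    // assignments.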
    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        if (VRM->hasPhys(Intervals[I]->reg))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
                      << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
                      << " : "
                      << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
                      << "]\n");
    Changed = true;
  }

  return Changed;
}