//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
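///
/// A minimal sketch of the idea (register choices are illustrative only):
///   buffer_load_dword v0, ...  ; issued asynchronously, increments vmcnt
///   s_waitcnt vmcnt(0)         ; inserted by this pass
///   v_add_f32 v1, v0, v0       ; first use of the loaded value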
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {

// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// an s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

using RegInterval = std::pair<signed, signed>;

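// Per-subtarget hardware maximums for each counter and for the register
// files; assumed to be filled in once when the pass is run on a function.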
struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

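// Hardware encodings of the first and last VGPR/SGPR; getRegInterval uses
// these to map an operand's encoded register number to a scoreboard slot.
// Assumed to be filled in alongside HardwareLimits.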
struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
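
// For example, with the values above, VGPR5 occupies scoreboard slot 5, the
// LDS token occupies slot SQ_MAX_PGM_VGPRS (256), and SGPR3 occupies slot
// NUM_ALL_VGPRS + 3 = 260.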

#define ForAllWaitEventType(w) \
  for (enum WaitEventType w = (enum WaitEventType)0; \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \
       (w) = (enum WaitEventType)((w) + 1))

// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of events happen in the bracket, the
// wait count may get decremented out of order; therefore we need to put in
// an "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
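      // expcnt can track at most getWaitCountMax(EXP_CNT) events in flight,
      // so anything older than UB - max must already have completed; pull
      // the lower bound up to match.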
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

  int32_t getScoreLB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    return NUM_INST_CNTS;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }

  bool counterOutOfOrder(InstCounterType T);
  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
  void clearWaitcnt() { Waitcnt = nullptr; }
  MachineInstr *getWaitcnt() const { return Waitcnt; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  bool MixedExpTypes = false;
  int32_t PostOrder = 0;
  MachineInstr *Waitcnt = nullptr;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records waitcnt status at the end of
// loop footer from the previous iteration. We also maintain an iteration
// count to track the number of times the loop has been visited. When it
// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
// at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize the wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaInfo::IsaVersion IV;
  AMDGPUAS AMDGPUASI;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

  // ForceEmitZeroWaitcnts: force all waitcnt instrs to be emitted as
  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(
        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
  }

  bool isForceEmitWaitcnt() const {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1))
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not. Examples include vm_cnt for a
  // buffer-store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << " VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << " LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << " EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << " UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
                                                int ScoreToWait) {
  unsigned int NeedWait = 0;
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_waitcnt instruction with a
    // conservative value of 0 for the counter.
    NeedWait = CNT_MASK(T);
    setScoreLB(T, getScoreUB(T));
    return NeedWait;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // types of events in the bracket. Also emit an s_waitcnt with a
      // conservative value of 0 for the counter.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else {
      NeedWait = CNT_MASK(T);
      setScoreLB(T, ScoreToWait);
    }
  }

  return NeedWait;
}

// Where there are multiple types of events in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
  switch (T) {
  case VM_CNT:
    return false;
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // Scalar memory reads can always go out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// Given wait count encodings, checks if LHS is stronger than RHS.
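/// Counts mean "wait until the counter is <= N", so a smaller count is a
/// stronger wait: e.g. vmcnt(0) is stronger than vmcnt(2).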
bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
  if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
    return false;
  return true;
}

/// Given wait count encodings, create a new encoding which is stronger
/// than or equal to both.
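/// This takes the component-wise minimum: e.g. combining vmcnt(1) lgkmcnt(3)
/// with vmcnt(2) lgkmcnt(0) yields vmcnt(1) lgkmcnt(0).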
unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
  unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
                            AMDGPU::decodeVmcnt(IV, RHS));
  unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
                              AMDGPU::decodeLgkmcnt(IV, RHS));
  unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
                             AMDGPU::decodeExpcnt(IV, RHS));
  return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
}

/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
  // To emit, or not to emit - that's the question!
  // Start with an assumption that there is no need to emit.
  unsigned int EmitWaitcnt = 0;

  // No need to wait before phi. If a phi-move exists, then the wait should
  // have been inserted before the move. If a phi-move does not exist, then
  // the wait should be inserted before the real use. The same is true for
  // sc-merge. It is not a coincidence that all these cases correspond to the
  // instructions that are skipped in the assembling loop.
  bool NeedLineMapping = false; // TODO: Check on this.

  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
  bool ForceEmitZeroWaitcnt = false;

  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr() &&
      // TODO: any other opcode?
      !NeedLineMapping) {
    return;
  }

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitWaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitWaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitWaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                               ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUASI.LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand was used by a recent export/store
    // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUASI.LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      // Wait on everything, not just LGKM. vccz reads usually come from
      // terminators, and we always wait on everything at the end of the
      // block, so if we only wait on LGKM here, we might end up with
      // another s_waitcnt inserted right after this if there are non-LGKM
      // instructions still outstanding.
      // FIXME: this is too conservative / the comment is wrong.
      // We don't wait on everything at the end of the block and we combine
      // waitcnts so we should never have back-to-back waitcnts.
      ForceEmitZeroWaitcnt = true;
      EmitWaitcnt = true;
    }
  }

  // Does this operand processing indicate s_wait counter update?
  if (EmitWaitcnt || IsForceEmitWaitcnt) {
    int CntVal[NUM_INST_CNTS];

    bool UseDefaultWaitcntStrategy = true;
    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
      // Force all waitcnts to 0.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
      }
      CntVal[VM_CNT] = 0;
      CntVal[EXP_CNT] = 0;
      CntVal[LGKM_CNT] = 0;
      UseDefaultWaitcntStrategy = false;
    }

    if (UseDefaultWaitcntStrategy) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (EmitWaitcnt & CNT_MASK(T)) {
          int Delta =
              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
          if (Delta >= MaxDelta) {
            Delta = -1;
            if (T != EXP_CNT) {
              ScoreBrackets->setScoreLB(
                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
            }
            EmitWaitcnt &= ~CNT_MASK(T);
          }
          CntVal[T] = Delta;
        } else {
          // If we are not waiting for a particular counter then encode
          // it as -1 which means "don't care."
          CntVal[T] = -1;
        }
      }
    }

    // If we are not waiting on any counter we can skip the wait altogether.
    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
      if (!OldWaitcnt ||
          (AMDGPU::decodeVmcnt(IV, Imm) !=
           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
          (AMDGPU::decodeExpcnt(IV, Imm) !=
           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
        if (ContainingLoop) {
          MachineBasicBlock *TBB = ContainingLoop->getHeader();
          BlockWaitcntBrackets *ScoreBracket =
              BlockWaitcntBracketsMap[TBB].get();
          if (!ScoreBracket) {
            assert(!BlockVisitedSet.count(TBB));
            BlockWaitcntBracketsMap[TBB] =
                llvm::make_unique<BlockWaitcntBrackets>(ST);
            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
          }
          ScoreBracket->setRevisitLoop(true);
          LLVM_DEBUG(dbgs()
                         << "set-revisit2: Block"
                         << ContainingLoop->getHeader()->getNumber() << '\n';);
        }
      }

      // Update an existing waitcount, or make a new one.
      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
      // We don't remove waitcnts that existed prior to the waitcnt
      // pass. Check if the waitcnt to-be-inserted can be avoided
      // or if the prev waitcnt can be updated.
      bool insertSWaitInst = true;
      for (MachineBasicBlock::iterator I = MI.getIterator(),
                                       B = MI.getParent()->begin();
           insertSWaitInst && I != B; --I) {
        if (I == MI.getIterator())
          continue;

        switch (I->getOpcode()) {
        case AMDGPU::S_WAITCNT:
          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
            insertSWaitInst = false;
          else if (!OldWaitcnt) {
            OldWaitcnt = &*I;
            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
          }
          break;
        // TODO: skip over instructions which never require wait.
        }
        break;
      }
      if (insertSWaitInst) {
        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
          if (ForceEmitZeroWaitcnts)
            LLVM_DEBUG(
                dbgs()
                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
          if (IsForceEmitWaitcnt)
            LLVM_DEBUG(dbgs()
                       << "Force emit a s_waitcnt due to debug counter\n");

          OldWaitcnt->getOperand(0).setImm(Enc);
          if (!OldWaitcnt->getParent())
            MI.getParent()->insert(MI, OldWaitcnt);

          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *OldWaitcnt << '\n');
        } else {
          auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                                   MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                               .addImm(Enc);
          TrackedWaitcntSet.insert(SWaitInst);

          LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *SWaitInst << '\n');
        }
      }

      if (CntVal[EXP_CNT] == 0) {
        ScoreBrackets->setMixedExpTypes(false);
      }
    }
  }
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}

// This is a flat memory operation. Check to see if it has memory
// tokens for both LDS and Memory, and if so mark it as a flat operation.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
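  // With no memory operands to inspect, conservatively assume the operation
  // may access LDS.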
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
      return true;
  }

  return false;
}

void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
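      // Export targets 32-63 are the parameter exports and 12-15 the
      // position exports; everything else (e.g. color/MRT, Z, null) takes
      // the GPR-lock path.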
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}

// Merge the score brackets of the Block's predecessors;
// this merged score bracket is used when adding waitcnts to the Block.
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  int32_t MaxPending[NUM_INST_CNTS] = {0};
  int32_t MaxFlat[NUM_INST_CNTS] = {0};
  bool MixedExpTypes = false;

  // For single basic block loops, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate, and then replace en masse. For non-single basic block loops,
  // just clear Block's current score bracket and repopulate in-place.
  bool IsSelfPred;
  std::unique_ptr<BlockWaitcntBrackets> S;

  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
      != Block.pred_end();
  if (IsSelfPred) {
    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    ScoreBrackets = S.get();
  }

  ScoreBrackets->clear();

  // See if there are any uninitialized predecessors. If so, emit an
  // s_waitcnt 0 at the beginning of the block.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], span);
      span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], span);
    }

    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  // Also handle kills for exit block.
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        int Span = KillWaitBrackets[I]->getScoreUB(T) -
                   KillWaitBrackets[I]->getScoreLB(T);
        MaxPending[T] = std::max(MaxPending[T], Span);
        Span = KillWaitBrackets[I]->pendingFlat(T) -
               KillWaitBrackets[I]->getScoreLB(T);
        MaxFlat[T] = std::max(MaxFlat[T], Span);
      }

      MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
    }
  }

  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }

    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
      int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    }
  }

#if 0
  // LC does not (unlike SC) add a waitcnt at beginning. Leaving it as a
  // marker.
  // TODO: how does LC distinguish between function entry and main entry?
  // If this is the entry to a function, force a wait.
  MachineBasicBlock &Entry = Block.getParent()->front();
  if (Entry.getNumber() == Block.getNumber()) {
    ScoreBrackets->setWaitAtBeginning();
    return;
  }
#endif

  // Now set the current Block's brackets to the largest ending bracket.
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  ScoreBrackets->setMixedExpTypes(MixedExpTypes);

  // Set the register scoreboard.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    // Now merge the gpr_reg_score information.
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int PredLB = PredScoreBrackets->getScoreLB(T);
      int PredUB = PredScoreBrackets->getScoreUB(T);
      if (PredLB < PredUB) {
        int PredScale = MaxPending[T] - PredUB;
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
          if (PredRegScore <= PredLB)
            continue;
          int NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            int PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }

    // Also merge the WaitEvent information.
    ForAllWaitEventType(W) {
      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
      int PredEventUB = PredScoreBrackets->getEventUB(W);
      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
        int NewEventUB =
            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
        if (NewEventUB > 0) {
          ScoreBrackets->setEventUB(
              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
        }
      }
    }
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  // Set the register scoreboard.
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      // Now merge the gpr_reg_score information.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        int PredLB = KillWaitBrackets[I]->getScoreLB(T);
        int PredUB = KillWaitBrackets[I]->getScoreUB(T);
        if (PredLB < PredUB) {
          int PredScale = MaxPending[T] - PredUB;
          // Merge vgpr scores.
          for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
            int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
          }
          // Also need to merge sgpr scores for lgkm_cnt.
          if (T == LGKM_CNT) {
            for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
              int PredRegScore =
                  KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
              if (PredRegScore <= PredLB)
                continue;
              int NewRegScore = PredScale + PredRegScore;
              ScoreBrackets->setRegScore(
                  J + NUM_ALL_VGPRS, LGKM_CNT,
                  std::max(
                      ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                      NewRegScore));
            }
          }
        }
      }

      // Also merge the WaitEvent information.
      ForAllWaitEventType(W) {
        enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
        int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
        if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
          int NewEventUB =
              MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
          if (NewEventUB > 0) {
            ScoreBrackets->setEventUB(
                W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
          }
        }
      }
    }
  }

  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
  // sequencing predecessors, because changes to EXEC require waitcnts due to
  // the delayed nature of these operations.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_gds_ub > 0) {
        ScoreBrackets->setEventUB(
            GDS_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
      }
    }
    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_exp_ub > 0) {
        ScoreBrackets->setEventUB(
            EXP_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
      }
    }
  }

  // If this is a single-block loop, update the score brackets. Not needed for
  // other blocks, as we did this in-place.
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] =
        llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
}

/// Return true if the given basic block is a "bottom" block of a loop.
/// This works even if the loop is discontiguous. This also handles
/// multiple back-edges for the same "header" block of a loop.
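/// For example (illustrative numbering): if a loop consists of BB1 (header),
/// BB2, and BB4, and both BB2 and BB4 branch back to BB1, then BB2 and BB4
/// are both bottom blocks, even though BB3 outside the loop sits between
/// them.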
bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                    const MachineBasicBlock *Block) {
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
      return true;
    }
  }
  return false;
}

/// Count the number of "bottom" basic blocks of a loop.
unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
  unsigned Count = 0;
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB->isSuccessor(Loop->getHeader())) {
      Count++;
    }
  }
  return Count;
}

// Generate s_waitcnt instructions where needed.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information.
  mergeInputScoreBrackets(Block);

  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets->dump();
  });

  // Walk over the instructions.
  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;
    // Remove any previously existing waitcnts.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
      // Remove the waitcnt-pass-generated waitcnts; the pass will add them
      // back as needed.
      if (!TrackedWaitcntSet.count(&Inst))
        ++Iter;
      else {
        ++Iter;
        Inst.removeFromParent();
      }
      ScoreBrackets->setWaitcnt(&Inst);
      continue;
    }

    // Kill instructions generate a conditional branch to the endmain block.
    // Merge the current waitcnt state into the endmain block information.
    // TODO: Are there other flavors of KILL instruction?
    if (Inst.getOpcode() == AMDGPU::KILL) {
      addKillWaitBracket(ScoreBrackets);
    }

    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }
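    // Summary of the hazard this guards against: on SI/CI, an outstanding
    // scalar-memory (SMEM) operation can leave the vccz bit stale, so an
    // instruction that reads vccz needs the restore sequence emitted further
    // below.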

    // Generate an s_waitcnt instruction to be placed before
    // cur_Inst, if needed.
    generateWaitcntInstBefore(Inst, ScoreBrackets);

    updateEventWaitcntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    ScoreBrackets->clearWaitcnt();

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
      ScoreBrackets->updateByWait(LGKM_CNT,
                                  ScoreBrackets->getScoreUB(LGKM_CNT));
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
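      // In effect, the sequence below emits: s_mov_b64 vcc, vcc.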
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    ++Iter;
  }

  // Check if we need to force convergence at loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    LLVM_DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each
    // loop should take at most (n+1) iterations for it to converge naturally,
    // where n is the number of bottom blocks. If this threshold is reached and
    // the result hasn't converged, then we force convergence by inserting
    // an s_waitcnt at the end of the loop footer.
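    // For example, a loop with two back-edges (two bottom blocks) gets up to
    // three iterations to converge naturally before a wait-on-0 is forced.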
    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
      // To ensure convergence, we need to make the wait events at the loop
      // footer be no more than those from the previous iteration.
      // As a simplification, instead of tracking individual scores and
      // generating the precise wait count, just wait on 0.
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
          break;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                          .addImm(0);
          TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        LLVM_DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or
        // created in previous iterations and added back since block traversal
        // always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  AMDGPUASI = ST->getAMDGPUAS();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1))
    ForceEmitWaitcnt[T] = false;

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();
  BlockVisitedSet.clear();
  VCCZBugHandledSet.clear();
  LoopWaitcntDataMap.clear();
  BlockWaitcntProcessedSet.clear();

  // Walk over the blocks in reverse post order, inserting
  // s_waitcnt where needed.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least one re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
      unsigned Count = countNumBottomBlocks(ContainingLoop);

      // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every block.
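      // Concretely: the header must be processed once per bottom block, so
      // keep flagging a loop revisit until it has been seen Count times.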
      if ((std::count(BlockWaitcntProcessedSet.begin(),
                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
      }
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Record that waitcnts have been processed at least once for this block.
    BlockWaitcntProcessedSet.push_back(&MBB);

    // See if we want to revisit the loop. If a loop has multiple back-edges,
    // we shouldn't revisit the same "bottom" basic block.
    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
        std::count(BlockWaitcntProcessedSet.begin(),
                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // Loop converged, so reset the iteration count. If this loop gets
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
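    // Sketch of the resulting pattern at each end-of-program point:
    //   s_dcache_wb
    //   s_endpgm
    // (emitted only when the last scalar store was not already followed by a
    // flush).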
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them, and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller-preserved registers.
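    // The zero immediate below encodes vmcnt(0) expcnt(0) lgkmcnt(0), i.e. a
    // full wait on all three counters.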
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
            TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);

    Modified = true;
  }

  return Modified;
}