• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
//===-- SIInsertWaits.cpp - Insert Wait Instructions ----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//
18 
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

#define DEBUG_TYPE "si-insert-waits"

using namespace llvm;
32 
33 namespace {
34 
35 /// \brief One variable for each of the hardware counters
36 typedef union {
37   struct {
38     unsigned VM;
39     unsigned EXP;
40     unsigned LGKM;
41   } Named;
42   unsigned Array[3];
43 
44 } Counters;
45 
46 typedef enum {
47   OTHER,
48   SMEM,
49   VMEM
50 } InstType;
51 
52 typedef Counters RegCounters[512];
53 typedef std::pair<unsigned, unsigned> RegInterval;
54 
55 class SIInsertWaits : public MachineFunctionPass {
56 
57 private:
58   const SISubtarget *ST;
59   const SIInstrInfo *TII;
60   const SIRegisterInfo *TRI;
61   const MachineRegisterInfo *MRI;
62 
63   /// \brief Constant hardware limits
64   static const Counters WaitCounts;
65 
66   /// \brief Constant zero value
67   static const Counters ZeroCounts;
68 
69   /// \brief Counter values we have already waited on.
70   Counters WaitedOn;
71 
72   /// \brief Counter values that we must wait on before the next counter
73   /// increase.
74   Counters DelayedWaitOn;
75 
76   /// \brief Counter values for last instruction issued.
77   Counters LastIssued;
78 
79   /// \brief Registers used by async instructions.
80   RegCounters UsedRegs;
81 
82   /// \brief Registers defined by async instructions.
83   RegCounters DefinedRegs;
84 
85   /// \brief Different export instruction types seen since last wait.
86   unsigned ExpInstrTypesSeen;
87 
88   /// \brief Type of the last opcode.
89   InstType LastOpcodeType;
90 
91   bool LastInstWritesM0;
92 
93   /// \brief Whether the machine function returns void
94   bool ReturnsVoid;
95 
96   /// Whether the VCCZ bit is possibly corrupt
97   bool VCCZCorrupt;
98 
99   /// \brief Get increment/decrement amount for this instruction.
100   Counters getHwCounts(MachineInstr &MI);
101 
102   /// \brief Is operand relevant for async execution?
103   bool isOpRelevant(MachineOperand &Op);
104 
105   /// \brief Get register interval an operand affects.
106   RegInterval getRegInterval(const TargetRegisterClass *RC,
107                              const MachineOperand &Reg) const;
108 
109   /// \brief Handle instructions async components
110   void pushInstruction(MachineBasicBlock &MBB,
111                        MachineBasicBlock::iterator I,
112                        const Counters& Increment);
113 
114   /// \brief Insert the actual wait instruction
115   bool insertWait(MachineBasicBlock &MBB,
116                   MachineBasicBlock::iterator I,
117                   const Counters &Counts);
118 
119   /// \brief Handle existing wait instructions (from intrinsics)
120   void handleExistingWait(MachineBasicBlock::iterator I);
121 
122   /// \brief Do we need def2def checks?
123   bool unorderedDefines(MachineInstr &MI);
124 
125   /// \brief Resolve all operand dependencies to counter requirements
126   Counters handleOperands(MachineInstr &MI);
127 
128   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
129   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
130 
131   /// Return true if there are LGKM instrucitons that haven't been waited on
132   /// yet.
133   bool hasOutstandingLGKM() const;
134 
135 public:
136   static char ID;
137 
SIInsertWaits()138   SIInsertWaits() :
139     MachineFunctionPass(ID),
140     ST(nullptr),
141     TII(nullptr),
142     TRI(nullptr),
143     ExpInstrTypesSeen(0),
144     VCCZCorrupt(false) { }
145 
146   bool runOnMachineFunction(MachineFunction &MF) override;
147 
getPassName() const148   const char *getPassName() const override {
149     return "SI insert wait instructions";
150   }
151 
getAnalysisUsage(AnalysisUsage & AU) const152   void getAnalysisUsage(AnalysisUsage &AU) const override {
153     AU.setPreservesCFG();
154     MachineFunctionPass::getAnalysisUsage(AU);
155   }
156 };
157 
158 } // End anonymous namespace
159 
160 INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
161                       "SI Insert Waits", false, false)
162 INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
163                     "SI Insert Waits", false, false)
164 
165 char SIInsertWaits::ID = 0;
166 
167 char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
168 
createSIInsertWaitsPass()169 FunctionPass *llvm::createSIInsertWaitsPass() {
170   return new SIInsertWaits();
171 }
172 
173 const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } };
174 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
175 
readsVCCZ(unsigned Opcode)176 static bool readsVCCZ(unsigned Opcode) {
177   return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
178 }
179 
hasOutstandingLGKM() const180 bool SIInsertWaits::hasOutstandingLGKM() const {
181   return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
182 }
183 
getHwCounts(MachineInstr & MI)184 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
185   uint64_t TSFlags = MI.getDesc().TSFlags;
186   Counters Result = { { 0, 0, 0 } };
187 
188   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
189 
190   // Only consider stores or EXP for EXP_CNT
191   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
192       (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
193 
194   // LGKM may uses larger values
195   if (TSFlags & SIInstrFlags::LGKM_CNT) {
196 
197     if (TII->isSMRD(MI)) {
198 
199       if (MI.getNumOperands() != 0) {
200         assert(MI.getOperand(0).isReg() &&
201                "First LGKM operand must be a register!");
202 
203         // XXX - What if this is a write into a super register?
204         const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
205         unsigned Size = RC->getSize();
206         Result.Named.LGKM = Size > 4 ? 2 : 1;
207       } else {
208         // s_dcache_inv etc. do not have a a destination register. Assume we
209         // want a wait on these.
210         // XXX - What is the right value?
211         Result.Named.LGKM = 1;
212       }
213     } else {
214       // DS
215       Result.Named.LGKM = 1;
216     }
217 
218   } else {
219     Result.Named.LGKM = 0;
220   }
221 
222   return Result;
223 }
224 
isOpRelevant(MachineOperand & Op)225 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
226   // Constants are always irrelevant
227   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
228     return false;
229 
230   // Defines are always relevant
231   if (Op.isDef())
232     return true;
233 
234   // For exports all registers are relevant
235   MachineInstr &MI = *Op.getParent();
236   if (MI.getOpcode() == AMDGPU::EXP)
237     return true;
238 
239   // For stores the stored value is also relevant
240   if (!MI.getDesc().mayStore())
241     return false;
242 
243   // Check if this operand is the value being stored.
244   // Special case for DS/FLAT instructions, since the address
245   // operand comes before the value operand and it may have
246   // multiple data operands.
247 
248   if (TII->isDS(MI) || TII->isFLAT(MI)) {
249     MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
250     if (Data && Op.isIdenticalTo(*Data))
251       return true;
252   }
253 
254   if (TII->isDS(MI)) {
255     MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
256     if (Data0 && Op.isIdenticalTo(*Data0))
257       return true;
258 
259     MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
260     return Data1 && Op.isIdenticalTo(*Data1);
261   }
262 
263   // NOTE: This assumes that the value operand is before the
264   // address operand, and that there is only one value operand.
265   for (MachineInstr::mop_iterator I = MI.operands_begin(),
266        E = MI.operands_end(); I != E; ++I) {
267 
268     if (I->isReg() && I->isUse())
269       return Op.isIdenticalTo(*I);
270   }
271 
272   return false;
273 }
274 
getRegInterval(const TargetRegisterClass * RC,const MachineOperand & Reg) const275 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
276                                           const MachineOperand &Reg) const {
277   unsigned Size = RC->getSize();
278   assert(Size >= 4);
279 
280   RegInterval Result;
281   Result.first = TRI->getEncodingValue(Reg.getReg());
282   Result.second = Result.first + Size / 4;
283 
284   return Result;
285 }
286 
pushInstruction(MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const Counters & Increment)287 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
288                                     MachineBasicBlock::iterator I,
289                                     const Counters &Increment) {
290 
291   // Get the hardware counter increments and sum them up
292   Counters Limit = ZeroCounts;
293   unsigned Sum = 0;
294 
295   for (unsigned i = 0; i < 3; ++i) {
296     LastIssued.Array[i] += Increment.Array[i];
297     if (Increment.Array[i])
298       Limit.Array[i] = LastIssued.Array[i];
299     Sum += Increment.Array[i];
300   }
301 
302   // If we don't increase anything then that's it
303   if (Sum == 0) {
304     LastOpcodeType = OTHER;
305     return;
306   }
307 
308   if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
309     // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
310     // or SMEM clause, respectively.
311     //
312     // The temporary workaround is to break the clauses with S_NOP.
313     //
314     // The proper solution would be to allocate registers such that all source
315     // and destination registers don't overlap, e.g. this is illegal:
316     //   r0 = load r2
317     //   r2 = load r0
318     if (LastOpcodeType == VMEM && Increment.Named.VM) {
319       // Insert a NOP to break the clause.
320       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
321           .addImm(0);
322       LastInstWritesM0 = false;
323     }
324 
325     if (TII->isSMRD(*I))
326       LastOpcodeType = SMEM;
327     else if (Increment.Named.VM)
328       LastOpcodeType = VMEM;
329   }
330 
331   // Remember which export instructions we have seen
332   if (Increment.Named.EXP) {
333     ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
334   }
335 
336   for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
337     MachineOperand &Op = I->getOperand(i);
338     if (!isOpRelevant(Op))
339       continue;
340 
341     const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
342     RegInterval Interval = getRegInterval(RC, Op);
343     for (unsigned j = Interval.first; j < Interval.second; ++j) {
344 
345       // Remember which registers we define
346       if (Op.isDef())
347         DefinedRegs[j] = Limit;
348 
349       // and which one we are using
350       if (Op.isUse())
351         UsedRegs[j] = Limit;
352     }
353   }
354 }
355 
insertWait(MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const Counters & Required)356 bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
357                                MachineBasicBlock::iterator I,
358                                const Counters &Required) {
359 
360   // End of program? No need to wait on anything
361   // A function not returning void needs to wait, because other bytecode will
362   // be appended after it and we don't know what it will be.
363   if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
364     return false;
365 
366   // Figure out if the async instructions execute in order
367   bool Ordered[3];
368 
369   // VM_CNT is always ordered
370   Ordered[0] = true;
371 
372   // EXP_CNT is unordered if we have both EXP & VM-writes
373   Ordered[1] = ExpInstrTypesSeen == 3;
374 
375   // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
376   Ordered[2] = false;
377 
378   // The values we are going to put into the S_WAITCNT instruction
379   Counters Counts = WaitCounts;
380 
381   // Do we really need to wait?
382   bool NeedWait = false;
383 
384   for (unsigned i = 0; i < 3; ++i) {
385 
386     if (Required.Array[i] <= WaitedOn.Array[i])
387       continue;
388 
389     NeedWait = true;
390 
391     if (Ordered[i]) {
392       unsigned Value = LastIssued.Array[i] - Required.Array[i];
393 
394       // Adjust the value to the real hardware possibilities.
395       Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
396 
397     } else
398       Counts.Array[i] = 0;
399 
400     // Remember on what we have waited on.
401     WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
402   }
403 
404   if (!NeedWait)
405     return false;
406 
407   // Reset EXP_CNT instruction types
408   if (Counts.Named.EXP == 0)
409     ExpInstrTypesSeen = 0;
410 
411   // Build the wait instruction
412   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
413           .addImm((Counts.Named.VM & 0xF) |
414                   ((Counts.Named.EXP & 0x7) << 4) |
415                   ((Counts.Named.LGKM & 0xF) << 8));
416 
417   LastOpcodeType = OTHER;
418   LastInstWritesM0 = false;
419   return true;
420 }
421 
422 /// \brief helper function for handleOperands
increaseCounters(Counters & Dst,const Counters & Src)423 static void increaseCounters(Counters &Dst, const Counters &Src) {
424 
425   for (unsigned i = 0; i < 3; ++i)
426     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
427 }
428 
429 /// \brief check whether any of the counters is non-zero
countersNonZero(const Counters & Counter)430 static bool countersNonZero(const Counters &Counter) {
431   for (unsigned i = 0; i < 3; ++i)
432     if (Counter.Array[i])
433       return true;
434   return false;
435 }
436 
handleExistingWait(MachineBasicBlock::iterator I)437 void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
438   assert(I->getOpcode() == AMDGPU::S_WAITCNT);
439 
440   unsigned Imm = I->getOperand(0).getImm();
441   Counters Counts, WaitOn;
442 
443   Counts.Named.VM = Imm & 0xF;
444   Counts.Named.EXP = (Imm >> 4) & 0x7;
445   Counts.Named.LGKM = (Imm >> 8) & 0xF;
446 
447   for (unsigned i = 0; i < 3; ++i) {
448     if (Counts.Array[i] <= LastIssued.Array[i])
449       WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
450     else
451       WaitOn.Array[i] = 0;
452   }
453 
454   increaseCounters(DelayedWaitOn, WaitOn);
455 }
456 
handleOperands(MachineInstr & MI)457 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
458 
459   Counters Result = ZeroCounts;
460 
461   // For each register affected by this instruction increase the result
462   // sequence.
463   //
464   // TODO: We could probably just look at explicit operands if we removed VCC /
465   // EXEC from SMRD dest reg classes.
466   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
467     MachineOperand &Op = MI.getOperand(i);
468     if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
469       continue;
470 
471     const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
472     RegInterval Interval = getRegInterval(RC, Op);
473     for (unsigned j = Interval.first; j < Interval.second; ++j) {
474 
475       if (Op.isDef()) {
476         increaseCounters(Result, UsedRegs[j]);
477         increaseCounters(Result, DefinedRegs[j]);
478       }
479 
480       if (Op.isUse())
481         increaseCounters(Result, DefinedRegs[j]);
482     }
483   }
484 
485   return Result;
486 }
487 
handleSendMsg(MachineBasicBlock & MBB,MachineBasicBlock::iterator I)488 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
489                                   MachineBasicBlock::iterator I) {
490   if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
491     return;
492 
493   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
494   if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
495     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
496     LastInstWritesM0 = false;
497     return;
498   }
499 
500   // Set whether this instruction sets M0
501   LastInstWritesM0 = false;
502 
503   unsigned NumOperands = I->getNumOperands();
504   for (unsigned i = 0; i < NumOperands; i++) {
505     const MachineOperand &Op = I->getOperand(i);
506 
507     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
508       LastInstWritesM0 = true;
509   }
510 }
511 
512 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
513 // around other non-memory instructions.
runOnMachineFunction(MachineFunction & MF)514 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
515   bool Changes = false;
516 
517   ST = &MF.getSubtarget<SISubtarget>();
518   TII = ST->getInstrInfo();
519   TRI = &TII->getRegisterInfo();
520   MRI = &MF.getRegInfo();
521 
522   WaitedOn = ZeroCounts;
523   DelayedWaitOn = ZeroCounts;
524   LastIssued = ZeroCounts;
525   LastOpcodeType = OTHER;
526   LastInstWritesM0 = false;
527   ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();
528 
529   memset(&UsedRegs, 0, sizeof(UsedRegs));
530   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
531 
532   SmallVector<MachineInstr *, 4> RemoveMI;
533 
534   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
535        BI != BE; ++BI) {
536 
537     MachineBasicBlock &MBB = *BI;
538     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
539          I != E; ++I) {
540 
541       if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
542         // There is a hardware bug on CI/SI where SMRD instruction may corrupt
543         // vccz bit, so when we detect that an instruction may read from a
544         // corrupt vccz bit, we need to:
545         // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
546         //    complete.
547         // 2. Restore the correct value of vccz by writing the current value
548         //    of vcc back to vcc.
549 
550         if (TII->isSMRD(I->getOpcode())) {
551           VCCZCorrupt = true;
552         } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
553           // FIXME: We only care about SMRD instructions here, not LDS or GDS.
554           // Whenever we store a value in vcc, the correct value of vccz is
555           // restored.
556           VCCZCorrupt = false;
557         }
558 
559         // Check if we need to apply the bug work-around
560         if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
561           DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
562 
563           // Wait on everything, not just LGKM.  vccz reads usually come from
564           // terminators, and we always wait on everything at the end of the
565           // block, so if we only wait on LGKM here, we might end up with
566           // another s_waitcnt inserted right after this if there are non-LGKM
567           // instructions still outstanding.
568           insertWait(MBB, I, LastIssued);
569 
570           // Restore the vccz bit.  Any time a value is written to vcc, the vcc
571           // bit is updated, so we can restore the bit by reading the value of
572           // vcc and then writing it back to the register.
573           BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
574                   AMDGPU::VCC)
575                   .addReg(AMDGPU::VCC);
576         }
577       }
578 
579       // Record pre-existing, explicitly requested waits
580       if (I->getOpcode() == AMDGPU::S_WAITCNT) {
581         handleExistingWait(*I);
582         RemoveMI.push_back(&*I);
583         continue;
584       }
585 
586       Counters Required;
587 
588       // Wait for everything before a barrier.
589       //
590       // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
591       // but we also want to wait for any other outstanding transfers before
592       // signalling other hardware blocks
593       if (I->getOpcode() == AMDGPU::S_BARRIER ||
594           I->getOpcode() == AMDGPU::S_SENDMSG)
595         Required = LastIssued;
596       else
597         Required = handleOperands(*I);
598 
599       Counters Increment = getHwCounts(*I);
600 
601       if (countersNonZero(Required) || countersNonZero(Increment))
602         increaseCounters(Required, DelayedWaitOn);
603 
604       Changes |= insertWait(MBB, I, Required);
605 
606       pushInstruction(MBB, I, Increment);
607       handleSendMsg(MBB, I);
608     }
609 
610     // Wait for everything at the end of the MBB
611     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
612   }
613 
614   for (MachineInstr *I : RemoveMI)
615     I->eraseFromParent();
616 
617   return Changes;
618 }
619