1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI Machine Scheduler interface 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H 15 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H 16 17 #include "SIInstrInfo.h" 18 #include "llvm/CodeGen/MachineBasicBlock.h" 19 #include "llvm/CodeGen/MachineScheduler.h" 20 #include "llvm/CodeGen/RegisterPressure.h" 21 #include "llvm/CodeGen/ScheduleDAG.h" 22 #include <cassert> 23 #include <cstdint> 24 #include <map> 25 #include <memory> 26 #include <set> 27 #include <vector> 28 29 namespace llvm { 30 31 enum SIScheduleCandReason { 32 NoCand, 33 RegUsage, 34 Latency, 35 Successor, 36 Depth, 37 NodeOrder 38 }; 39 40 struct SISchedulerCandidate { 41 // The reason for this candidate. 42 SIScheduleCandReason Reason = NoCand; 43 44 // Set of reasons that apply to multiple candidates. 45 uint32_t RepeatReasonSet = 0; 46 47 SISchedulerCandidate() = default; 48 isRepeatSISchedulerCandidate49 bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } setRepeatSISchedulerCandidate50 void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } 51 }; 52 53 class SIScheduleDAGMI; 54 class SIScheduleBlockCreator; 55 56 enum SIScheduleBlockLinkKind { 57 NoData, 58 Data 59 }; 60 61 class SIScheduleBlock { 62 SIScheduleDAGMI *DAG; 63 SIScheduleBlockCreator *BC; 64 65 std::vector<SUnit*> SUnits; 66 std::map<unsigned, unsigned> NodeNum2Index; 67 std::vector<SUnit*> TopReadySUs; 68 std::vector<SUnit*> ScheduledSUnits; 69 70 /// The top of the unscheduled zone. 71 IntervalPressure TopPressure; 72 RegPressureTracker TopRPTracker; 73 74 // Pressure: number of said class of registers needed to 75 // store the live virtual and real registers. 76 // We do care only of SGPR32 and VGPR32 and do track only virtual registers. 77 // Pressure of additional registers required inside the block. 78 std::vector<unsigned> InternalAdditionnalPressure; 79 // Pressure of input and output registers 80 std::vector<unsigned> LiveInPressure; 81 std::vector<unsigned> LiveOutPressure; 82 // Registers required by the block, and outputs. 83 // We do track only virtual registers. 84 // Note that some registers are not 32 bits, 85 // and thus the pressure is not equal 86 // to the number of live registers. 87 std::set<unsigned> LiveInRegs; 88 std::set<unsigned> LiveOutRegs; 89 90 bool Scheduled = false; 91 bool HighLatencyBlock = false; 92 93 std::vector<unsigned> HasLowLatencyNonWaitedParent; 94 95 // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table. 96 unsigned ID; 97 98 std::vector<SIScheduleBlock*> Preds; // All blocks predecessors. 99 // All blocks successors, and the kind of link 100 std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs; 101 unsigned NumHighLatencySuccessors = 0; 102 103 public: SIScheduleBlock(SIScheduleDAGMI * DAG,SIScheduleBlockCreator * BC,unsigned ID)104 SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC, 105 unsigned ID): 106 DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {} 107 108 ~SIScheduleBlock() = default; 109 getID()110 unsigned getID() const { return ID; } 111 112 /// Functions for Block construction. 113 void addUnit(SUnit *SU); 114 115 // When all SUs have been added. 116 void finalizeUnits(); 117 118 // Add block pred, which has instruction predecessor of SU. 119 void addPred(SIScheduleBlock *Pred); 120 void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind); 121 getPreds()122 const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; } 123 ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> getSuccs()124 getSuccs() const { return Succs; } 125 126 unsigned Height; // Maximum topdown path length to block without outputs 127 unsigned Depth; // Maximum bottomup path length to block without inputs 128 getNumHighLatencySuccessors()129 unsigned getNumHighLatencySuccessors() const { 130 return NumHighLatencySuccessors; 131 } 132 isHighLatencyBlock()133 bool isHighLatencyBlock() { return HighLatencyBlock; } 134 135 // This is approximative. 136 // Ideally should take into accounts some instructions (rcp, etc) 137 // are 4 times slower. getCost()138 int getCost() { return SUnits.size(); } 139 140 // The block Predecessors and Successors must be all registered 141 // before fastSchedule(). 142 // Fast schedule with no particular requirement. 143 void fastSchedule(); 144 getScheduledUnits()145 std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; } 146 147 // Complete schedule that will try to minimize reg pressure and 148 // low latencies, and will fill liveins and liveouts. 149 // Needs all MIs to be grouped between BeginBlock and EndBlock. 150 // The MIs can be moved after the scheduling, 151 // it is just used to allow correct track of live registers. 152 void schedule(MachineBasicBlock::iterator BeginBlock, 153 MachineBasicBlock::iterator EndBlock); 154 isScheduled()155 bool isScheduled() { return Scheduled; } 156 157 // Needs the block to be scheduled inside 158 // TODO: find a way to compute it. getInternalAdditionnalRegUsage()159 std::vector<unsigned> &getInternalAdditionnalRegUsage() { 160 return InternalAdditionnalPressure; 161 } 162 getInRegs()163 std::set<unsigned> &getInRegs() { return LiveInRegs; } getOutRegs()164 std::set<unsigned> &getOutRegs() { return LiveOutRegs; } 165 166 void printDebug(bool Full); 167 168 private: 169 struct SISchedCandidate : SISchedulerCandidate { 170 // The best SUnit candidate. 171 SUnit *SU = nullptr; 172 173 unsigned SGPRUsage; 174 unsigned VGPRUsage; 175 bool IsLowLatency; 176 unsigned LowLatencyOffset; 177 bool HasLowLatencyNonWaitedParent; 178 179 SISchedCandidate() = default; 180 isValidSISchedCandidate181 bool isValid() const { return SU; } 182 183 // Copy the status of another candidate without changing policy. setBestSISchedCandidate184 void setBest(SISchedCandidate &Best) { 185 assert(Best.Reason != NoCand && "uninitialized Sched candidate"); 186 SU = Best.SU; 187 Reason = Best.Reason; 188 SGPRUsage = Best.SGPRUsage; 189 VGPRUsage = Best.VGPRUsage; 190 IsLowLatency = Best.IsLowLatency; 191 LowLatencyOffset = Best.LowLatencyOffset; 192 HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent; 193 } 194 }; 195 196 void undoSchedule(); 197 198 void undoReleaseSucc(SUnit *SU, SDep *SuccEdge); 199 void releaseSucc(SUnit *SU, SDep *SuccEdge); 200 // InOrOutBlock: restrict to links pointing inside the block (true), 201 // or restrict to links pointing outside the block (false). 202 void releaseSuccessors(SUnit *SU, bool InOrOutBlock); 203 204 void nodeScheduled(SUnit *SU); 205 void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand); 206 void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand); 207 SUnit* pickNode(); 208 void traceCandidate(const SISchedCandidate &Cand); 209 void initRegPressure(MachineBasicBlock::iterator BeginBlock, 210 MachineBasicBlock::iterator EndBlock); 211 }; 212 213 struct SIScheduleBlocks { 214 std::vector<SIScheduleBlock*> Blocks; 215 std::vector<int> TopDownIndex2Block; 216 std::vector<int> TopDownBlock2Index; 217 }; 218 219 enum SISchedulerBlockCreatorVariant { 220 LatenciesAlone, 221 LatenciesGrouped, 222 LatenciesAlonePlusConsecutive 223 }; 224 225 class SIScheduleBlockCreator { 226 SIScheduleDAGMI *DAG; 227 // unique_ptr handles freeing memory for us. 228 std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs; 229 std::map<SISchedulerBlockCreatorVariant, 230 SIScheduleBlocks> Blocks; 231 std::vector<SIScheduleBlock*> CurrentBlocks; 232 std::vector<int> Node2CurrentBlock; 233 234 // Topological sort 235 // Maps topological index to the node number. 236 std::vector<int> TopDownIndex2Block; 237 std::vector<int> TopDownBlock2Index; 238 std::vector<int> BottomUpIndex2Block; 239 240 // 0 -> Color not given. 241 // 1 to SUnits.size() -> Reserved group (you should only add elements to them). 242 // Above -> Other groups. 243 int NextReservedID; 244 int NextNonReservedID; 245 std::vector<int> CurrentColoring; 246 std::vector<int> CurrentTopDownReservedDependencyColoring; 247 std::vector<int> CurrentBottomUpReservedDependencyColoring; 248 249 public: 250 SIScheduleBlockCreator(SIScheduleDAGMI *DAG); 251 252 SIScheduleBlocks 253 getBlocks(SISchedulerBlockCreatorVariant BlockVariant); 254 255 bool isSUInBlock(SUnit *SU, unsigned ID); 256 257 private: 258 // Give a Reserved color to every high latency. 259 void colorHighLatenciesAlone(); 260 261 // Create groups of high latencies with a Reserved color. 262 void colorHighLatenciesGroups(); 263 264 // Compute coloring for topdown and bottom traversals with 265 // different colors depending on dependencies on Reserved colors. 266 void colorComputeReservedDependencies(); 267 268 // Give color to all non-colored SUs according to Reserved groups dependencies. 269 void colorAccordingToReservedDependencies(); 270 271 // Divides Blocks having no bottom up or top down dependencies on Reserved groups. 272 // The new colors are computed according to the dependencies on the other blocks 273 // formed with colorAccordingToReservedDependencies. 274 void colorEndsAccordingToDependencies(); 275 276 // Cut groups into groups with SUs in consecutive order (except for Reserved groups). 277 void colorForceConsecutiveOrderInGroup(); 278 279 // Merge Constant loads that have all their users into another group to the group. 280 // (TODO: else if all their users depend on the same group, put them there) 281 void colorMergeConstantLoadsNextGroup(); 282 283 // Merge SUs that have all their users into another group to the group 284 void colorMergeIfPossibleNextGroup(); 285 286 // Merge SUs that have all their users into another group to the group, 287 // but only for Reserved groups. 288 void colorMergeIfPossibleNextGroupOnlyForReserved(); 289 290 // Merge SUs that have all their users into another group to the group, 291 // but only if the group is no more than a few SUs. 292 void colorMergeIfPossibleSmallGroupsToNextGroup(); 293 294 // Divides Blocks with important size. 295 // Idea of implementation: attribute new colors depending on topdown and 296 // bottom up links to other blocks. 297 void cutHugeBlocks(); 298 299 // Put in one group all instructions with no users in this scheduling region 300 // (we'd want these groups be at the end). 301 void regroupNoUserInstructions(); 302 303 // Give Reserved color to export instructions 304 void colorExports(); 305 306 void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant); 307 308 void topologicalSort(); 309 310 void scheduleInsideBlocks(); 311 312 void fillStats(); 313 }; 314 315 enum SISchedulerBlockSchedulerVariant { 316 BlockLatencyRegUsage, 317 BlockRegUsageLatency, 318 BlockRegUsage 319 }; 320 321 class SIScheduleBlockScheduler { 322 SIScheduleDAGMI *DAG; 323 SISchedulerBlockSchedulerVariant Variant; 324 std::vector<SIScheduleBlock*> Blocks; 325 326 std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages; 327 std::set<unsigned> LiveRegs; 328 // Num of schedulable unscheduled blocks reading the register. 329 std::map<unsigned, unsigned> LiveRegsConsumers; 330 331 std::vector<unsigned> LastPosHighLatencyParentScheduled; 332 int LastPosWaitedHighLatency; 333 334 std::vector<SIScheduleBlock*> BlocksScheduled; 335 unsigned NumBlockScheduled; 336 std::vector<SIScheduleBlock*> ReadyBlocks; 337 338 unsigned VregCurrentUsage; 339 unsigned SregCurrentUsage; 340 341 // Currently is only approximation. 342 unsigned maxVregUsage; 343 unsigned maxSregUsage; 344 345 std::vector<unsigned> BlockNumPredsLeft; 346 std::vector<unsigned> BlockNumSuccsLeft; 347 348 public: 349 SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, 350 SISchedulerBlockSchedulerVariant Variant, 351 SIScheduleBlocks BlocksStruct); 352 ~SIScheduleBlockScheduler() = default; 353 getBlocks()354 std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; } 355 getVGPRUsage()356 unsigned getVGPRUsage() { return maxVregUsage; } getSGPRUsage()357 unsigned getSGPRUsage() { return maxSregUsage; } 358 359 private: 360 struct SIBlockSchedCandidate : SISchedulerCandidate { 361 // The best Block candidate. 362 SIScheduleBlock *Block = nullptr; 363 364 bool IsHighLatency; 365 int VGPRUsageDiff; 366 unsigned NumSuccessors; 367 unsigned NumHighLatencySuccessors; 368 unsigned LastPosHighLatParentScheduled; 369 unsigned Height; 370 371 SIBlockSchedCandidate() = default; 372 isValidSIBlockSchedCandidate373 bool isValid() const { return Block; } 374 375 // Copy the status of another candidate without changing policy. setBestSIBlockSchedCandidate376 void setBest(SIBlockSchedCandidate &Best) { 377 assert(Best.Reason != NoCand && "uninitialized Sched candidate"); 378 Block = Best.Block; 379 Reason = Best.Reason; 380 IsHighLatency = Best.IsHighLatency; 381 VGPRUsageDiff = Best.VGPRUsageDiff; 382 NumSuccessors = Best.NumSuccessors; 383 NumHighLatencySuccessors = Best.NumHighLatencySuccessors; 384 LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled; 385 Height = Best.Height; 386 } 387 }; 388 389 bool tryCandidateLatency(SIBlockSchedCandidate &Cand, 390 SIBlockSchedCandidate &TryCand); 391 bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand, 392 SIBlockSchedCandidate &TryCand); 393 SIScheduleBlock *pickBlock(); 394 395 void addLiveRegs(std::set<unsigned> &Regs); 396 void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs); 397 void releaseBlockSuccs(SIScheduleBlock *Parent); 398 void blockScheduled(SIScheduleBlock *Block); 399 400 // Check register pressure change 401 // by scheduling a block with these LiveIn and LiveOut. 402 std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs, 403 std::set<unsigned> &OutRegs); 404 405 void schedule(); 406 }; 407 408 struct SIScheduleBlockResult { 409 std::vector<unsigned> SUs; 410 unsigned MaxSGPRUsage; 411 unsigned MaxVGPRUsage; 412 }; 413 414 class SIScheduler { 415 SIScheduleDAGMI *DAG; 416 SIScheduleBlockCreator BlockCreator; 417 418 public: SIScheduler(SIScheduleDAGMI * DAG)419 SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {} 420 421 ~SIScheduler() = default; 422 423 struct SIScheduleBlockResult 424 scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, 425 SISchedulerBlockSchedulerVariant ScheduleVariant); 426 }; 427 428 class SIScheduleDAGMI final : public ScheduleDAGMILive { 429 const SIInstrInfo *SITII; 430 const SIRegisterInfo *SITRI; 431 432 std::vector<SUnit> SUnitsLinksBackup; 433 434 // For moveLowLatencies. After all Scheduling variants are tested. 435 std::vector<unsigned> ScheduledSUnits; 436 std::vector<unsigned> ScheduledSUnitsInv; 437 438 unsigned VGPRSetID; 439 unsigned SGPRSetID; 440 441 public: 442 SIScheduleDAGMI(MachineSchedContext *C); 443 444 ~SIScheduleDAGMI() override; 445 446 // Entry point for the schedule. 447 void schedule() override; 448 449 // To init Block's RPTracker. initRPTracker(RegPressureTracker & RPTracker)450 void initRPTracker(RegPressureTracker &RPTracker) { 451 RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false); 452 } 453 getBB()454 MachineBasicBlock *getBB() { return BB; } getCurrentTop()455 MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; } getCurrentBottom()456 MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; } getLIS()457 LiveIntervals *getLIS() { return LIS; } getMRI()458 MachineRegisterInfo *getMRI() { return &MRI; } getTRI()459 const TargetRegisterInfo *getTRI() { return TRI; } GetTopo()460 ScheduleDAGTopologicalSort *GetTopo() { return &Topo; } getEntrySU()461 SUnit& getEntrySU() { return EntrySU; } getExitSU()462 SUnit& getExitSU() { return ExitSU; } 463 464 void restoreSULinksLeft(); 465 466 template<typename _Iterator> void fillVgprSgprCost(_Iterator First, 467 _Iterator End, 468 unsigned &VgprUsage, 469 unsigned &SgprUsage); 470 getInRegs()471 std::set<unsigned> getInRegs() { 472 std::set<unsigned> InRegs; 473 for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { 474 InRegs.insert(RegMaskPair.RegUnit); 475 } 476 return InRegs; 477 } 478 getOutRegs()479 std::set<unsigned> getOutRegs() { 480 std::set<unsigned> OutRegs; 481 for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { 482 OutRegs.insert(RegMaskPair.RegUnit); 483 } 484 return OutRegs; 485 }; 486 getVGPRSetID()487 unsigned getVGPRSetID() const { return VGPRSetID; } getSGPRSetID()488 unsigned getSGPRSetID() const { return SGPRSetID; } 489 490 private: 491 void topologicalSort(); 492 // After scheduling is done, improve low latency placements. 493 void moveLowLatencies(); 494 495 public: 496 // Some stats for scheduling inside blocks. 497 std::vector<unsigned> IsLowLatencySU; 498 std::vector<unsigned> LowLatencyOffset; 499 std::vector<unsigned> IsHighLatencySU; 500 // Topological sort 501 // Maps topological index to the node number. 502 std::vector<int> TopDownIndex2SU; 503 std::vector<int> BottomUpIndex2SU; 504 }; 505 506 } // end namespace llvm 507 508 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H 509