1 /*
2  * Copyright (c) 2023 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "aarch64_schedule.h"
17 #include <ctime>
18 #include "aarch64_cg.h"
19 #include "aarch64_operand.h"
20 #include "aarch64_dependence.h"
21 #include "file_utils.h"
22 #include "pressure.h"
23 
24 /*
25  * This phase is Instruction Scheduling.
26  * It performs local list scheduling within each basic block.
27  * The entry is AArch64Schedule::ListScheduling, which traverses all basic blocks;
28  * for each basic block it will:
29  *  1. build a dependence graph;
30  *  2. combine clinit pairs and str&ldr pairs;
31  *  3. reorder instructions.
32  */
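/*
 * Orientation note (a sketch, not taken from ListScheduling itself): judging from the options
 * checked in Init(), the reorder step is presumably DoSchedule() (list scheduling) by default,
 * while CGOptions::IsDruteForceSched() and CGOptions::IsSimulateSched() select
 * DoBruteForceSchedule() / SimulateOnly() instead.
 */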
33 namespace maplebe {
34 namespace {
35 constexpr uint32 kSecondToLastNode = 2;
36 }  // namespace
37 
38 uint32 AArch64Schedule::maxUnitIndex = 0;
39 /* reserve two registers for special purposes */
40 int AArch64Schedule::intRegPressureThreshold = static_cast<int>(R27 - R0);
41 int AArch64Schedule::fpRegPressureThreshold = static_cast<int>(V30 - V0);
42 int AArch64Schedule::intCalleeSaveThresholdBase = static_cast<int>(R29 - R19);
43 int AArch64Schedule::intCalleeSaveThresholdEnhance = static_cast<int>(R30 - R19);
44 int AArch64Schedule::fpCalleeSaveThreshold = static_cast<int>(R16 - R8);
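/*
 * Illustrative reading of the thresholds above (an assumption: the AArch64reg enum numbers the
 * registers contiguously): R27 - R0 and V30 - V0 keep two int/fp registers in reserve,
 * R29 - R19 counts the callee-saved GPR range x19..x28, R30 - R19 enlarges that window by one,
 * and R16 - R8 equals the eight callee-saved FP registers v8..v15.
 */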
45 /* Initialize the schedule's data structures. */
46 void AArch64Schedule::Init()
47 {
48     readyList.clear();
49     nodeSize = nodes.size();
50     lastSeparatorIndex = 0;
51     mad->ReleaseAllUnits();
52     DepNode *node = nodes[0];
53 
54     DEBUG_ASSERT(node->GetType() == kNodeTypeSeparator,
55                  "CG internal error, the first node should be a separator node.");
56 
57     if (CGOptions::IsDruteForceSched() || CGOptions::IsSimulateSched()) {
58         for (auto nodeTemp : nodes) {
59             nodeTemp->SetVisit(0);
60             nodeTemp->SetState(kNormal);
61             nodeTemp->SetSchedCycle(0);
62             nodeTemp->SetEStart(0);
63             nodeTemp->SetLStart(0);
64         }
65     }
66 
67     readyList.emplace_back(node);
68     node->SetState(kReady);
69 
70     /* Init validPredsSize and validSuccsSize. */
71     for (auto nodeTemp : nodes) {
72         nodeTemp->SetValidPredsSize(nodeTemp->GetPreds().size());
73         nodeTemp->SetValidSuccsSize(nodeTemp->GetSuccs().size());
74     }
75 }
76 
77 /*
78  *  An insn that can be combined must meet these conditions:
79  *  1. it is a str/ldr insn;
80  *  2. its address mode is kAddrModeBOi, [baseReg, offset];
81  *  3. the register operand size equals the memory operand size;
82  *  4. if USE_32BIT_REF is defined, the register operand size should be 4 bytes;
83  *  5. for stp/ldp, the imm must be within -512 and 504 (64-bit), or -256 and 252 (32-bit);
84  *  6. for a pair instruction on 8/4-byte registers, the imm must be a multiple of 8/4.
85  *  Returns true if the insn can be combined.
86  */
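/*
 * Hypothetical example: two adjacent 64-bit loads from the same base,
 *   ldr x1, [x0, #8]
 *   ldr x2, [x0, #16]
 * meet every condition above (BOi addressing, 8-byte size, in-range offsets that are multiples
 * of 8), so each returns true here and the pair is a candidate for CombineMemoryAccessPair()
 * in FindAndCombineMemoryAccessPair().
 */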
87 bool AArch64Schedule::CanCombine(const Insn &insn) const
88 {
89     MOperator opCode = insn.GetMachineOpcode();
90     if ((opCode != MOP_xldr) && (opCode != MOP_wldr) && (opCode != MOP_dldr) && (opCode != MOP_sldr) &&
91         (opCode != MOP_xstr) && (opCode != MOP_wstr) && (opCode != MOP_dstr) && (opCode != MOP_sstr)) {
92         return false;
93     }
94 
95     DEBUG_ASSERT(insn.GetOperand(1).IsMemoryAccessOperand(), "expects mem operands");
96     auto &memOpnd = static_cast<MemOperand &>(insn.GetOperand(1));
97     MemOperand::AArch64AddressingMode addrMode = memOpnd.GetAddrMode();
98     if (addrMode != MemOperand::kAddrModeBOi) {
99         return false;
100     }
101 
102     auto &regOpnd = static_cast<RegOperand &>(insn.GetOperand(0));
103     if (regOpnd.GetSize() != memOpnd.GetSize()) {
104         return false;
105     }
106 
107     uint32 size = regOpnd.GetSize() >> kLog2BitsPerByte;
108 #ifdef USE_32BIT_REF
109     if (insn.IsAccessRefField() && (size > (kAarch64IntregBytelen >> 1))) {
110         return false;
111     }
112 #endif /* USE_32BIT_REF */
113 
114     OfstOperand *offset = memOpnd.GetOffsetImmediate();
115     if (offset == nullptr) {
116         return false;
117     }
118     int32 offsetValue = static_cast<int32>(offset->GetOffsetValue());
119     if (size == kAarch64IntregBytelen) { /* 64 bit */
120         if ((offsetValue <= kStpLdpImm64LowerBound) || (offsetValue >= kStpLdpImm64UpperBound)) {
121             return false;
122         }
123     } else if (size == (kAarch64IntregBytelen >> 1)) { /* 32 bit */
124         if ((offsetValue <= kStpLdpImm32LowerBound) || (offsetValue >= kStpLdpImm32UpperBound)) {
125             return false;
126         }
127     }
128 
129     /* a pair instruction on 8/4-byte registers must have an imm that is a multiple of 8/4 */
130     if ((static_cast<uint32>(offsetValue) % size) != 0) {
131         return false;
132     }
133     return true;
134 }
135 
136 /* After building the dependence graph, combine str&ldr pairs. */
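/*
 * Sketch of the loop below: nodes are popped in dependence order; ready loads/stores for which
 * CanCombine() holds are parked in memList, and once only memList remains, the head of memList
 * is matched against the rest by FindAndCombineMemoryAccessPair() before being "scheduled".
 */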
137 void AArch64Schedule::MemoryAccessPairOpt()
138 {
139     Init();
140     std::vector<DepNode *> memList;
141 
142     while ((!readyList.empty()) || !memList.empty()) {
143         DepNode *readNode = nullptr;
144         if (!readyList.empty()) {
145             readNode = readyList[0];
146             readyList.erase(readyList.begin());
147         } else {
148             if (memList[0]->GetType() != kNodeTypeEmpty) {
149                 FindAndCombineMemoryAccessPair(memList);
150             }
151             readNode = memList[0];
152             memList.erase(memList.cbegin());
153         }
154 
155         /* schedule readNode */
156         CHECK_FATAL(readNode != nullptr, "readNode is null in MemoryAccessPairOpt");
157         readNode->SetState(kScheduled);
158 
159         /* add readNode's succs to readyList or memList. */
160         for (auto succLink : readNode->GetSuccs()) {
161             DepNode &succNode = succLink->GetTo();
162             succNode.DecreaseValidPredsSize();
163             if (succNode.GetValidPredsSize() == 0) {
164                 DEBUG_ASSERT(succNode.GetState() == kNormal, "schedule state should be kNormal");
165                 succNode.SetState(kReady);
166                 DEBUG_ASSERT(succNode.GetInsn() != nullptr, "insn can't be nullptr!");
167                 if (CanCombine(*succNode.GetInsn())) {
168                     memList.emplace_back(&succNode);
169                 } else {
170                     readyList.emplace_back(&succNode);
171                 }
172             }
173         }
174     }
175 
176     for (auto node : nodes) {
177         node->SetVisit(0);
178         node->SetState(kNormal);
179     }
180 }
181 
182 /* Find the correct memory access pair for memList[0] and combine it. */
183 void AArch64Schedule::FindAndCombineMemoryAccessPair(const std::vector<DepNode *> &memList)
184 {
185     DEBUG_ASSERT(!memList.empty(), "memList should not be empty");
186     CHECK_FATAL(memList[0]->GetInsn() != nullptr, "memList[0]'s insn should not be nullptr");
187     MemOperand *currMemOpnd = static_cast<MemOperand *>(memList[0]->GetInsn()->GetMemOpnd());
188     DEBUG_ASSERT(currMemOpnd != nullptr, "opnd should not be nullptr");
189     DEBUG_ASSERT(currMemOpnd->IsMemoryAccessOperand(), "opnd should be memOpnd");
190     int32 currOffsetVal = static_cast<int32>(currMemOpnd->GetOffsetImmediate()->GetOffsetValue());
191     MOperator currMop = memList[0]->GetInsn()->GetMachineOpcode();
192     /* find a depNode to combine with memList[0], and break; */
193     for (auto it = std::next(memList.begin(), 1); it != memList.end(); ++it) {
194         DEBUG_ASSERT((*it)->GetInsn() != nullptr, "null ptr check");
195 
196         if (currMop == (*it)->GetInsn()->GetMachineOpcode()) {
197             MemOperand *nextMemOpnd = static_cast<MemOperand *>((*it)->GetInsn()->GetMemOpnd());
198             CHECK_FATAL(nextMemOpnd != nullptr, "opnd should not be nullptr");
199             CHECK_FATAL(nextMemOpnd->IsMemoryAccessOperand(), "opnd should be MemOperand");
200             int32 nextOffsetVal = static_cast<int32>(nextMemOpnd->GetOffsetImmediate()->GetOffsetValue());
201             uint32 size = currMemOpnd->GetSize() >> kLog2BitsPerByte;
202             if ((nextMemOpnd->GetBaseRegister() == currMemOpnd->GetBaseRegister()) &&
203                 (nextMemOpnd->GetSize() == currMemOpnd->GetSize()) &&
204                 (static_cast<uint32>(abs(nextOffsetVal - currOffsetVal)) == size)) {
205                 /*
206                  * The ARM Architecture Reference Manual ARMv8, for the ARMv8-A architecture profile,
207                  * states (LDP, page K1-6125) that ldp cannot load into the same register twice.
208                  */
209                 if (((currMop == MOP_xldr) || (currMop == MOP_sldr) || (currMop == MOP_dldr) ||
210                      (currMop == MOP_wldr)) &&
211                     &(memList[0]->GetInsn()->GetOperand(0)) == &((*it)->GetInsn()->GetOperand(0))) {
212                     continue;
213                 }
214                 if (static_cast<RegOperand &>((*it)->GetInsn()->GetOperand(0)).GetRegisterType() !=
215                     static_cast<RegOperand &>(memList[0]->GetInsn()->GetOperand(0)).GetRegisterType()) {
216                     continue;
217                 }
218 
219                 if (LIST_SCHED_DUMP_REF) {
220                     LogInfo::MapleLogger() << "Combine insn: "
221                                            << "\n";
222                     memList[0]->GetInsn()->Dump();
223                     (*it)->GetInsn()->Dump();
224                 }
225                 depAnalysis->CombineMemoryAccessPair(*memList[0], **it, nextOffsetVal > currOffsetVal);
226                 if (LIST_SCHED_DUMP_REF) {
227                     LogInfo::MapleLogger() << "To: "
228                                            << "\n";
229                     memList[0]->GetInsn()->Dump();
230                 }
231                 break;
232             }
233         }
234     }
235 }
236 
237 /* combine clinit pairs. */
238 void AArch64Schedule::ClinitPairOpt()
239 {
240     for (auto it = nodes.begin(); it != nodes.end(); ++it) {
241         auto nextIt = std::next(it, 1);
242         if (nextIt == nodes.end()) {
243             return;
244         }
245 
246         if ((*it)->GetInsn()->GetMachineOpcode() == MOP_adrp_ldr) {
247             if ((*nextIt)->GetInsn()->GetMachineOpcode() == MOP_clinit_tail) {
248                 depAnalysis->CombineClinit(**it, **(nextIt), false);
249             } else if ((*nextIt)->GetType() == kNodeTypeSeparator) {
250                 nextIt = std::next(nextIt, 1);
251                 if (nextIt == nodes.end()) {
252                     return;
253                 }
254                 if ((*nextIt)->GetInsn()->GetMachineOpcode() == MOP_clinit_tail) {
255                     /* Combine the adrp_ldr with the clinit_tail across the separator node. */
256                     depAnalysis->CombineClinit(**it, **(nextIt), true);
257                 }
258             }
259         }
260     }
261 }
262 
263 /* Return the index of the next node whose type is kNodeTypeSeparator. */
264 uint32 AArch64Schedule::GetNextSepIndex() const
265 {
266     CHECK_FATAL(nodes.size() >= 1, "value overflow");
267     return ((lastSeparatorIndex + kMaxDependenceNum) < nodeSize) ? (lastSeparatorIndex + kMaxDependenceNum)
268                                                                  : (nodes.size() - 1);
269 }
270 
271 /* Do register pressure scheduling. */
272 void AArch64Schedule::RegPressureScheduling(BB &bb, MapleVector<DepNode *> &nodes)
273 {
274     auto *regSchedule = memPool.New<RegPressureSchedule>(cgFunc, alloc);
275     /*
276      * Get the current physical register amounts,
277      * in the order: undef, int regs, float regs, flag regs
278      */
279     const std::vector<int32> kRegNumVec = {0, V0, (kMaxRegNum - V0) + 1, 1};
280     regSchedule->InitBBInfo(bb, memPool, nodes);
281     regSchedule->BuildPhyRegInfo(kRegNumVec);
282     regSchedule->DoScheduling(nodes);
283 }
284 
285 /*
286  * Compute the earliest start (estart) of each node;
287  * the return value is the maximum estart.
288  */
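/*
 * Propagation example (hypothetical): if node A is ready at cycle c and edge A->B has latency 2,
 * the forward pass below enforces estart(B) >= estart(A) + 2; ComputeLstart() then walks the
 * same window backwards from the separator so that lstart(A) <= lstart(B) - 2.
 */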
289 uint32 AArch64Schedule::ComputeEstart(uint32 cycle)
290 {
291     std::vector<DepNode *> readyNodes;
292     uint32 maxIndex = GetNextSepIndex();
293 
294     if (CGOptions::IsDebugSched()) {
295         /* Check validPredsSize. */
296         for (uint32 i = lastSeparatorIndex; i <= maxIndex; ++i) {
297             DepNode *node = nodes[i];
298             int32 schedNum = 0;
299             for (const auto *predLink : node->GetPreds()) {
300                 if (predLink->GetFrom().GetState() == kScheduled) {
301                     ++schedNum;
302                 }
303             }
304             DEBUG_ASSERT(static_cast<uint32>(node->GetPreds().size()) >= static_cast<uint32>(schedNum),
305                          "unsigned value overflow");
306             CHECK_FATAL((static_cast<uint32>(node->GetPreds().size()) - static_cast<uint32>(schedNum)) ==
307                             node->GetValidPredsSize(),
308                 "validPredsSize error.");
309         }
310     }
311 
312     DEBUG_ASSERT(nodes[maxIndex]->GetType() == kNodeTypeSeparator,
313                  "CG internal error, nodes[maxIndex] should be a separator node.");
314 
315     (void)readyNodes.insert(readyNodes.cbegin(), readyList.cbegin(), readyList.cend());
316 
317     uint32 maxEstart = cycle;
318     for (uint32 i = lastSeparatorIndex; i <= maxIndex; ++i) {
319         DepNode *node = nodes[i];
320         node->SetVisit(0);
321     }
322 
323     for (auto *node : readyNodes) {
324         DEBUG_ASSERT(node->GetState() == kReady, "CG internal error, all nodes in ready list should be ready.");
325         if (node->GetEStart() < cycle) {
326             node->SetEStart(cycle);
327         }
328     }
329 
330     while (!readyNodes.empty()) {
331         DepNode *node = readyNodes.front();
332         readyNodes.erase(readyNodes.cbegin());
333 
334         for (const auto *succLink : node->GetSuccs()) {
335             DepNode &succNode = succLink->GetTo();
336             if (succNode.GetType() == kNodeTypeSeparator) {
337                 continue;
338             }
339 
340             if (succNode.GetEStart() < (node->GetEStart() + succLink->GetLatency())) {
341                 succNode.SetEStart(node->GetEStart() + succLink->GetLatency());
342             }
343             maxEstart = (maxEstart < succNode.GetEStart() ? succNode.GetEStart() : maxEstart);
344             succNode.IncreaseVisit();
345             if ((succNode.GetVisit() >= succNode.GetValidPredsSize()) && (succNode.GetType() != kNodeTypeSeparator)) {
346                 readyNodes.emplace_back(&succNode);
347             }
348             DEBUG_ASSERT(succNode.GetVisit() <= succNode.GetValidPredsSize(), "CG internal error.");
349         }
350     }
351 
352     return maxEstart;
353 }
354 
355 /* Compute the latest start (lstart) of each node. */
356 void AArch64Schedule::ComputeLstart(uint32 maxEstart)
357 {
358     /* std::vector is better than std::queue in run time */
359     std::vector<DepNode *> readyNodes;
360     uint32 maxIndex = GetNextSepIndex();
361 
362     DEBUG_ASSERT(nodes[maxIndex]->GetType() == kNodeTypeSeparator,
363                  "CG internal error, nodes[maxIndex] should be a separator node.");
364 
365     for (uint32 i = lastSeparatorIndex; i <= maxIndex; ++i) {
366         DepNode *node = nodes[i];
367         node->SetLStart(maxEstart);
368         node->SetVisit(0);
369     }
370 
371     readyNodes.emplace_back(nodes[maxIndex]);
372     while (!readyNodes.empty()) {
373         DepNode *node = readyNodes.front();
374         readyNodes.erase(readyNodes.cbegin());
375         for (const auto *predLink : node->GetPreds()) {
376             DepNode &predNode = predLink->GetFrom();
377             if (predNode.GetState() == kScheduled) {
378                 continue;
379             }
380 
381             if (predNode.GetLStart() > (node->GetLStart() - predLink->GetLatency())) {
382                 predNode.SetLStart(node->GetLStart() - predLink->GetLatency());
383             }
384             predNode.IncreaseVisit();
385             if ((predNode.GetVisit() >= predNode.GetValidSuccsSize()) && (predNode.GetType() != kNodeTypeSeparator)) {
386                 readyNodes.emplace_back(&predNode);
387             }
388 
389             DEBUG_ASSERT(predNode.GetVisit() <= predNode.GetValidSuccsSize(), "CG internal error.");
390         }
391     }
392 }
393 
394 /* Compute the earliest and latest start of each node that is in the readyList and not yet scheduled. */
395 void AArch64Schedule::UpdateELStartsOnCycle(uint32 cycle)
396 {
397     ComputeLstart(ComputeEstart(cycle));
398 }
399 
400 /* Count unit kinds into an array. Each array element holds the number of nodes that use that unit kind. */
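/*
 * unitKind is a bitmask of unit kinds; __builtin_ffs() yields the 1-based index of the lowest
 * set bit (0 when none remain), so the loop below visits each set bit exactly once.
 */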
401 void AArch64Schedule::CountUnitKind(const DepNode &depNode, uint32 array[], const uint32 arraySize) const
402 {
403     (void)arraySize;
404     DEBUG_ASSERT(arraySize >= kUnitKindLast, "CG internal error. unit kind number is not correct.");
405     uint32 unitKind = depNode.GetUnitKind();
406     uint32 index = static_cast<uint32>(__builtin_ffs(static_cast<int>(unitKind)));
407     while (index != 0) {
408         DEBUG_ASSERT(index < kUnitKindLast, "CG internal error. index error.");
409         ++array[index];
410         unitKind &= ~(1u << (index - 1u));
411         index = static_cast<uint32>(__builtin_ffs(static_cast<int>(unitKind)));
412     }
413 }
414 
415 /* Check whether a node uses a specific unit kind. */
416 bool AArch64Schedule::IfUseUnitKind(const DepNode &depNode, uint32 index)
417 {
418     uint32 unitKind = depNode.GetUnitKind();
419     uint32 idx = static_cast<uint32>(__builtin_ffs(static_cast<int>(unitKind)));
420     while (idx != 0) {
421         DEBUG_ASSERT(index < kUnitKindLast, "CG internal error. index error.");
422         if (idx == index) {
423             return true;
424         }
425         unitKind &= ~(1u << (idx - 1u));
426         idx = static_cast<uint32>(__builtin_ffs(static_cast<int>(unitKind)));
427     }
428 
429     return false;
430 }
431 
432 /* A sample schedule based on the dependence graph only, used to verify the correctness of the dependence graph. */
433 void AArch64Schedule::RandomTest()
434 {
435     Init();
436     nodes.clear();
437 
438     while (!readyList.empty()) {
439         DepNode *currNode = readyList.back();
440         currNode->SetState(kScheduled);
441         readyList.pop_back();
442         nodes.emplace_back(currNode);
443 
444         for (auto succLink : currNode->GetSuccs()) {
445             DepNode &succNode = succLink->GetTo();
446             bool ready = true;
447             for (auto predLink : succNode.GetPreds()) {
448                 DepNode &predNode = predLink->GetFrom();
449                 if (predNode.GetState() != kScheduled) {
450                     ready = false;
451                     break;
452                 }
453             }
454 
455             if (ready) {
456                 DEBUG_ASSERT(succNode.GetState() == kNormal, "succNode must be kNormal");
457                 readyList.emplace_back(&succNode);
458                 succNode.SetState(kReady);
459             }
460         }
461     }
462 }
463 
464 /* Remove target from readyList. */
465 void AArch64Schedule::EraseNodeFromReadyList(const DepNode &target)
466 {
467     EraseNodeFromNodeList(target, readyList);
468 }
469 
470 /* Remove target from nodeList. */
471 void AArch64Schedule::EraseNodeFromNodeList(const DepNode &target, MapleVector<DepNode *> &nodeList)
472 {
473     for (auto it = nodeList.begin(); it != nodeList.end(); ++it) {
474         if ((*it) == &target) {
475             nodeList.erase(it);
476             return;
477         }
478     }
479 
480     DEBUG_ASSERT(false, "CG internal error, erase node fail.");
481 }
482 
483 /* Dump schedule information for every node in availableReadyList in the current cycle. */
484 void AArch64Schedule::DumpDebugInfo(const ScheduleProcessInfo &scheduleInfo)
485 {
486     LogInfo::MapleLogger() << "Current cycle[ " << scheduleInfo.GetCurrCycle() << " ], Available in readyList is : \n";
487     for (auto node : scheduleInfo.GetAvailableReadyList()) {
488         LogInfo::MapleLogger() << "NodeIndex[ " << node->GetIndex() << " ], Estart[ " << node->GetEStart()
489                                << " ], Lstart[ ";
490         LogInfo::MapleLogger() << node->GetLStart() << " ], slot[ ";
491         LogInfo::MapleLogger() << (node->GetReservation() == nullptr ? "SlotNone"
492                                                                      : node->GetReservation()->GetSlotName())
493                                << " ], ";
494         LogInfo::MapleLogger() << "succNodeNum[ " << node->GetSuccs().size() << " ], ";
495         node->GetInsn()->Dump();
496         LogInfo::MapleLogger() << '\n';
497     }
498 }
499 
500 /*
501  * Select a node from availableReadyList according to some heuristic rules, then:
502  *   1. change targetNode's schedule information;
503  *   2. try to add successors of targetNode to readyList;
504  *   3. update the unscheduled node set when targetNode is the last kNodeTypeSeparator;
505  *   4. update AdvanceCycle.
506  */
507 void AArch64Schedule::SelectNode(AArch64ScheduleProcessInfo &scheduleInfo)
508 {
509     auto &availableReadyList = scheduleInfo.GetAvailableReadyList();
510     auto it = availableReadyList.begin();
511     DepNode *targetNode = *it;
512     if (availableReadyList.size() > 1) {
513         CalculateMaxUnitKindCount(scheduleInfo);
514         if (GetConsiderRegPressure()) {
515             UpdateReleaseRegInfo(scheduleInfo);
516         }
517         ++it;
518         for (; it != availableReadyList.end(); ++it) {
519             if (CompareDepNode(**it, *targetNode, scheduleInfo)) {
520                 targetNode = *it;
521             }
522         }
523     }
524     /* The priority of free-reg node is higher than pipeline */
525     while (!targetNode->IsResourceIdle()) {
526         scheduleInfo.IncCurrCycle();
527         mad->AdvanceOneCycleForAll();
528     }
529     if (GetConsiderRegPressure() && !scheduleInfo.IsFirstSeparator()) {
530         UpdateLiveRegSet(scheduleInfo, *targetNode);
531     }
532     /* push target node into scheduled nodes and turn it into kScheduled state */
533     scheduleInfo.PushElemIntoScheduledNodes(targetNode);
534 
535     EraseNodeFromReadyList(*targetNode);
536 
537     if (CGOptions::IsDebugSched()) {
538         LogInfo::MapleLogger() << "TargetNode : ";
539         targetNode->GetInsn()->Dump();
540         LogInfo::MapleLogger() << "\n";
541     }
542 
543     /* Update readyList. */
544     UpdateReadyList(*targetNode, readyList, true);
545 
546     if (targetNode->GetType() == kNodeTypeSeparator) {
547         /* If the target node is a separator node, update lastSeparatorIndex and calculate the estart and lstart
548          * of the depNodes between the current separator node and the new separator node.
549          */
550         if (!scheduleInfo.IsFirstSeparator()) {
551             lastSeparatorIndex += kMaxDependenceNum;
552             UpdateELStartsOnCycle(scheduleInfo.GetCurrCycle());
553         } else {
554             scheduleInfo.ResetIsFirstSeparator();
555         }
556     }
557 
558     UpdateAdvanceCycle(scheduleInfo, *targetNode);
559 }
560 
561 void AArch64Schedule::UpdateAdvanceCycle(AArch64ScheduleProcessInfo &scheduleInfo, const DepNode &targetNode) const
562 {
563     switch (targetNode.GetInsn()->GetLatencyType()) {
564         case kLtClinit:
565             scheduleInfo.SetAdvanceCycle(kClinitAdvanceCycle);
566             break;
567         case kLtAdrpLdr:
568             scheduleInfo.SetAdvanceCycle(kAdrpLdrAdvanceCycle);
569             break;
570         case kLtClinitTail:
571             scheduleInfo.SetAdvanceCycle(kClinitTailAdvanceCycle);
572             break;
573         default:
574             break;
575     }
576 
577     if ((scheduleInfo.GetAdvanceCycle() == 0) && mad->IsFullIssued()) {
578         if (targetNode.GetEStart() > scheduleInfo.GetCurrCycle()) {
579             scheduleInfo.SetAdvanceCycle(1 + targetNode.GetEStart() - scheduleInfo.GetCurrCycle());
580         } else {
581             scheduleInfo.SetAdvanceCycle(1);
582         }
583     }
584 }
585 
586 /*
587  * Advance mad's cycle until info's advanceCycle equals zero,
588  * and then clear info's availableReadyList.
589  */
590 void AArch64Schedule::UpdateScheduleProcessInfo(AArch64ScheduleProcessInfo &info) const
591 {
592     while (info.GetAdvanceCycle() > 0) {
593         info.IncCurrCycle();
594         mad->AdvanceOneCycleForAll();
595         info.DecAdvanceCycle();
596     }
597     info.ClearAvailableReadyList();
598 }
599 
600 /*
601  * Traverse the readyList forward; if a node in the readyList can be scheduled, add it to availableReadyList.
602  * Return true if availableReadyList is not empty.
603  */
604 bool AArch64Schedule::CheckSchedulable(AArch64ScheduleProcessInfo &info) const
605 {
606     for (auto node : readyList) {
607         if (GetConsiderRegPressure()) {
608             info.PushElemIntoAvailableReadyList(node);
609         } else {
610             if (node->IsResourceIdle() && node->GetEStart() <= info.GetCurrCycle()) {
611                 info.PushElemIntoAvailableReadyList(node);
612             }
613         }
614     }
615     return !info.AvailableReadyListIsEmpty();
616 }
617 
618 /*
619  * Calculate estimated machine cycle count for an input node series
620  */
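/*
 * Worked example (hypothetical chain A -> B -> C with latencies 3 and 1): A issues at cycle 1,
 * B must wait until cycle 1 + 3 = 4, and C issues at cycle max(4 + 1, 4 + 1) = 5, so the
 * function returns 5.
 */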
621 int AArch64Schedule::CalSeriesCycles(const MapleVector<DepNode *> &nodes) const
622 {
623     int currentCycle = 0;
624     /* after an instruction is issued, the minimum cycle count for the next instruction is 1 */
625     int instructionBaseCycleCount = 1;
626     std::map<DepNode *, int> scheduledCycleMap;
627     for (auto node : nodes) {
628         int latencyCycle = 0;
629         /* calculate the latest begin time of this node based on its predecessor's issue time and latency */
630         for (auto pred : node->GetPreds()) {
631             DepNode &from = pred->GetFrom();
632             int latency = static_cast<int>(pred->GetLatency());
633             int fromCycle = scheduledCycleMap[&from];
634             if (fromCycle + latency > latencyCycle) {
635                 latencyCycle = fromCycle + latency;
636             }
637         }
638         /* the issue time of this node is the max value between the next cycle and latest begin time */
639         if (currentCycle + instructionBaseCycleCount >= latencyCycle) {
640             currentCycle = currentCycle + instructionBaseCycleCount;
641         } else {
642             currentCycle = latencyCycle;
643         }
644         /* record this node's issue cycle */
645         scheduledCycleMap[node] = currentCycle;
646     }
647     return currentCycle;
648 }
649 
650 /* After building the dependence graph, schedule insns. */
651 uint32 AArch64Schedule::DoSchedule()
652 {
653     AArch64ScheduleProcessInfo scheduleInfo(nodeSize);
654     Init();
655     UpdateELStartsOnCycle(scheduleInfo.GetCurrCycle());
656     InitLiveRegSet(scheduleInfo);
657     while (!readyList.empty()) {
658         UpdateScheduleProcessInfo(scheduleInfo);
659         /* Check if schedulable */
660         if (!CheckSchedulable(scheduleInfo)) {
661             /* Advance cycle. */
662             scheduleInfo.SetAdvanceCycle(1);
663             continue;
664         }
665 
666         if (scheduleInfo.GetLastUpdateCycle() < scheduleInfo.GetCurrCycle()) {
667             scheduleInfo.SetLastUpdateCycle(scheduleInfo.GetCurrCycle());
668         }
669 
670         if (CGOptions::IsDebugSched()) {
671             DumpDebugInfo(scheduleInfo);
672         }
673 
674         /* Select a node to scheduling */
675         SelectNode(scheduleInfo);
676     }
677 
678     DEBUG_ASSERT(scheduleInfo.SizeOfScheduledNodes() == nodes.size(), "CG internal error, Not all nodes scheduled.");
679 
680     nodes.clear();
681     (void)nodes.insert(nodes.cbegin(), scheduleInfo.GetScheduledNodes().cbegin(),
682                        scheduleInfo.GetScheduledNodes().cend());
683     /* the second-to-last node is the true last node, because the last one is a kNodeTypeSeparator node */
684     DEBUG_ASSERT(nodes.size() >= 2, "size of nodes should be greater than or equal 2");  // size not less than 2
685     return (nodes[nodes.size() - 2]->GetSchedCycle());  // size-2 get the second to last node
686 }
687 
688 struct RegisterInfoUnit {
689     RegisterInfoUnit() : intRegNum(0), fpRegNum(0), ccRegNum(0) {}
690     uint32 intRegNum = 0;
691     uint32 fpRegNum = 0;
692     uint32 ccRegNum = 0;
693 };
694 
695 RegisterInfoUnit GetDepNodeDefType(const DepNode &depNode, const CGFunc &f)
696 {
697     RegisterInfoUnit rIU;
698     for (auto defRegNO : depNode.GetDefRegnos()) {
699         RegType defRegTy = AArch64ScheduleProcessInfo::GetRegisterType(f, defRegNO);
700         if (defRegTy == kRegTyInt) {
701             rIU.intRegNum++;
702         } else if (defRegTy == kRegTyFloat) {
703             rIU.fpRegNum++;
704         } else if (defRegTy == kRegTyCc) {
705             rIU.ccRegNum++;
706             DEBUG_ASSERT(rIU.ccRegNum <= 1, "spill cc reg?");
707         } else {
708             CHECK_FATAL(false, "NIY aarch64 register type");
709         }
710     }
711     /* call node will not increase reg def pressure */
712     if (depNode.GetInsn() != nullptr && depNode.GetInsn()->IsCall()) {
713         rIU.intRegNum = 0;
714         rIU.fpRegNum = 0;
715     }
716     return rIU;
717 }
718 
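/*
 * Sketch of the heuristic below: DoCSR ("code schedule for register pressure") compares two
 * ready nodes and prefers the one that frees registers, or that defines fewer, whenever the
 * live int/fp sets or the callee-save usage exceed the thresholds; it returns kDoCSP to fall
 * back to the pipeline heuristic (CSP) in CompareDepNode() when pressure does not decide.
 */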
719 AArch64Schedule::CSRResult AArch64Schedule::DoCSR(DepNode &node1, DepNode &node2,
720                                                   AArch64ScheduleProcessInfo &scheduleInfo) const
721 {
722     RegisterInfoUnit defRIU1 = GetDepNodeDefType(node1, cgFunc);
723     RegisterInfoUnit defRIU2 = GetDepNodeDefType(node2, cgFunc);
724     /* do not increase callee save pressure before call */
725     if (static_cast<int>(scheduleInfo.SizeOfCalleeSaveLiveRegister(true)) >= intCalleeSaveThreshold) {
726         if (defRIU1.intRegNum > 0 && defRIU2.intRegNum > 0) {
727             CSRResult csrInfo = ScheduleCrossCall(node1, node2);
728             if ((csrInfo == kNode1 && defRIU1.intRegNum >= scheduleInfo.GetFreeIntRegs(node1)) ||
729                 (csrInfo == kNode2 && defRIU2.intRegNum >= scheduleInfo.GetFreeIntRegs(node2))) {
730                 return csrInfo;
731             }
732         }
733     }
734     if (static_cast<int>(scheduleInfo.SizeOfCalleeSaveLiveRegister(false)) >= fpCalleeSaveThreshold) {
735         if (defRIU1.fpRegNum > 0 && defRIU2.fpRegNum > 0) {
736             CSRResult csrInfo = ScheduleCrossCall(node1, node2);
737             if ((csrInfo == kNode1 && defRIU1.fpRegNum >= scheduleInfo.GetFreeFpRegs(node1)) ||
738                 (csrInfo == kNode2 && defRIU2.fpRegNum >= scheduleInfo.GetFreeFpRegs(node2))) {
739                 return csrInfo;
740             }
741         }
742     }
743     auto findFreeRegNode = [&scheduleInfo, &node1, &node2](bool isInt) -> CSRResult {
744         auto freeRegNodes = isInt ? scheduleInfo.GetFreeIntRegNodeSet() : scheduleInfo.GetFreeFpRegNodeSet();
745         if (freeRegNodes.find(&node1) != freeRegNodes.end() && freeRegNodes.find(&node2) == freeRegNodes.end()) {
746             return kNode1;
747         }
748         if (freeRegNodes.find(&node1) == freeRegNodes.end() && freeRegNodes.find(&node2) != freeRegNodes.end()) {
749             return kNode2;
750         }
751         return kDoCSP;
752     };
753     if (static_cast<int>(scheduleInfo.SizeOfIntLiveRegSet()) >= intRegPressureThreshold) {
754         if (findFreeRegNode(true) != kDoCSP) {
755             return findFreeRegNode(true);
756         }
757     }
758     if (static_cast<int>(scheduleInfo.SizeOfFpLiveRegSet()) >= fpRegPressureThreshold) {
759         if (findFreeRegNode(false) != kDoCSP) {
760             return findFreeRegNode(false);
761         }
762     }
763 
764     bool canDoCSPFurther = false;
765     if (static_cast<int>(scheduleInfo.SizeOfIntLiveRegSet()) >= intRegPressureThreshold) {
766         if (defRIU1.intRegNum != defRIU2.intRegNum) {
767             return defRIU1.intRegNum < defRIU2.intRegNum ? kNode1 : kNode2;
768         } else {
769             canDoCSPFurther = defRIU1.intRegNum == 0;
770         }
771     }
772     if (static_cast<int>(scheduleInfo.SizeOfFpLiveRegSet()) >= fpRegPressureThreshold) {
773         if (defRIU1.fpRegNum != defRIU2.fpRegNum) {
774             return defRIU1.fpRegNum < defRIU2.fpRegNum ? kNode1 : kNode2;
775         } else {
776             canDoCSPFurther = (defRIU1.fpRegNum == 0 && canDoCSPFurther);
777         }
778     }
779     /* if both nodes are going to increase reg pressure, do not do CSP further */
780     return canDoCSPFurther ? kDoCSP : (node1.GetInsn()->GetId() < node2.GetInsn()->GetId() ? kNode1 : kNode2);
781 }
782 
783 AArch64Schedule::CSRResult AArch64Schedule::ScheduleCrossCall(const DepNode &node1, const DepNode &node2) const
784 {
785     uint32 node1ID = node1.GetInsn()->GetId();
786     uint32 node2ID = node2.GetInsn()->GetId();
787     bool order = node1ID < node2ID; /* true -- node1 before node2  false -- node1 after node2 */
788     Insn *beginInsn = order ? node1.GetInsn() : node2.GetInsn();
789     uint32 finalId = order ? node2ID : node1ID;
790     for (Insn *checkInsn = beginInsn; (checkInsn != nullptr && checkInsn->GetId() <= finalId);
791          checkInsn = checkInsn->GetNextMachineInsn()) {
792         if (checkInsn->IsCall()) {
793             return order ? kNode1 : kNode2;
794         }
795     }
796     return kDoCSP;
797 }
798 
799 /*
800  * Compare the priorities of node1 and node2 according to some heuristic rules;
801  * return true if node1's priority is higher.
802  * crp -- consider reg pressure
803  */
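/*
 * The tie-break order below is: the DoCSR() verdict (when considering reg pressure), smaller
 * LStart, use of the busiest unit kind (maxUnitIndex), slot0 preference, more successors, and
 * finally the original insn id.
 */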
804 bool AArch64Schedule::CompareDepNode(DepNode &node1, DepNode &node2, AArch64ScheduleProcessInfo &scheduleInfo) const
805 {
806     /*
807      * strategy CSR -- code schedule for register pressure
808      * if pressure is above the threshold, select the node which can reduce register pressure
809      */
810     if (GetConsiderRegPressure()) {
811         switch (DoCSR(node1, node2, scheduleInfo)) {
812             case kNode1:
813                 return true;
814             case kNode2:
815                 return false;
816             default:
817                 break;
818         }
819     }
820     /* strategy CSP -- code schedule for CPU pipeline */
821     /* less LStart first */
822     if (node1.GetLStart() != node2.GetLStart()) {
823         return node1.GetLStart() < node2.GetLStart();
824     }
825 
826     /* max unit kind use */
827     bool use1 = IfUseUnitKind(node1, maxUnitIndex);
828     bool use2 = IfUseUnitKind(node2, maxUnitIndex);
829     if (use1 != use2) {
830         return use1;
831     }
832 
833     /* slot0 first */
834     SlotType slotType1 = node1.GetReservation()->GetSlot();
835     SlotType slotType2 = node2.GetReservation()->GetSlot();
836     if (slotType1 == kSlots) {
837         slotType1 = kSlot0;
838     }
839     if (slotType2 == kSlots) {
840         slotType2 = kSlot0;
841     }
842     if (slotType1 != slotType2) {
843         return slotType1 < slotType2;
844     }
845 
846     /* more succNodes first */
847     if (node1.GetSuccs().size() != node2.GetSuccs().size()) {
848         return node1.GetSuccs().size() > node2.GetSuccs().size();
849     }
850 
851     /* default order */
852     return node1.GetInsn()->GetId() < node2.GetInsn()->GetId();
853 }
854 
855 /*
856  * Calculate how many of availableReadyList's nodes use each unit kind and save the index of the most used one in maxUnitIndex
857  */
858 void AArch64Schedule::CalculateMaxUnitKindCount(ScheduleProcessInfo &scheduleInfo) const
859 {
860     uint32 unitKindCount[kUnitKindLast] = {0};
861     for (auto node : scheduleInfo.GetAvailableReadyList()) {
862         CountUnitKind(*node, unitKindCount, kUnitKindLast);
863     }
864 
865     uint32 maxCount = 0;
866     maxUnitIndex = 0;
867     for (size_t i = 1; i < kUnitKindLast; ++i) {
868         if (maxCount < unitKindCount[i]) {
869             maxCount = unitKindCount[i];
870             maxUnitIndex = i;
871         }
872     }
873 }
874 
875 /*
876  * Update the release reg node set
877  * When node in this set is scheduled, register pressure can be reduced
878  */
879 void AArch64Schedule::UpdateReleaseRegInfo(AArch64ScheduleProcessInfo &scheduleInfo)
880 {
881     auto &availableReadyList = scheduleInfo.GetAvailableReadyList();
882     scheduleInfo.ClearALLFreeRegNodeSet();
883     /* Traverse availableReadyList and add those can reduce register pressure to release reg node set */
884     for (auto node : availableReadyList) {
885         std::set<regno_t> freeRegNO = CanFreeRegister(*node);
886         if (!freeRegNO.empty()) {
887             scheduleInfo.VaryFreeRegSet(cgFunc, freeRegNO, *node);
888         }
889     }
890 }
891 
892 /*
893  * Return the registers that an instruction can release after being scheduled.
894  */
895 std::set<regno_t> AArch64Schedule::CanFreeRegister(const DepNode &node) const
896 {
897     std::set<regno_t> freeRegSet;
898     for (auto reg : node.GetUseRegnos()) {
899         if (RegPressureSchedule::IsLastUse(node, reg)) {
900             freeRegSet.emplace(reg);
901         }
902     }
903     return freeRegSet;
904 }
905 
906 /*
907  * After an instruction is scheduled, update live reg set
908  */
909 void AArch64Schedule::UpdateLiveRegSet(AArch64ScheduleProcessInfo &scheduleInfo, const DepNode &node)
910 {
911     /* dealing with def reg, add def reg into the live reg set */
912     size_t i = 1;
913     for (auto &defReg : node.GetDefRegnos()) {
914         if (scheduleInfo.FindIntLiveReg(defReg) == 0 && scheduleInfo.FindFpLiveReg(defReg) == 0) {
915             scheduleInfo.VaryLiveRegSet(cgFunc, defReg, true);
916         }
917         /* delete dead def reg from live reg set because its live range is only 1 cycle */
918         if (node.GetRegDefs(i) == nullptr && liveOutRegNo.find(defReg) == liveOutRegNo.end()) {
919             scheduleInfo.VaryLiveRegSet(cgFunc, defReg, false);
920         }
921         ++i;
922     }
923     /* dealing with use reg, delete use reg from live reg set if this instruction is last use of it */
924     for (auto &useReg : node.GetUseRegnos()) {
925         if (RegPressureSchedule::IsLastUse(node, useReg)) {
926             if ((scheduleInfo.FindIntLiveReg(useReg) != 0 || scheduleInfo.FindFpLiveReg(useReg) != 0) &&
927                 liveOutRegNo.find(useReg) == liveOutRegNo.end()) {
928                 scheduleInfo.VaryLiveRegSet(cgFunc, useReg, false);
929             }
930         }
931     }
932 }
933 
934 /*
935  * Initialize the live reg set based on the live in reg information
936  */
937 void AArch64Schedule::InitLiveRegSet(AArch64ScheduleProcessInfo &scheduleInfo)
938 {
939     if (GetConsiderRegPressure()) {
940         for (auto reg : liveInRegNo) {
941             scheduleInfo.VaryLiveRegSet(cgFunc, reg, true);
942         }
943     }
944 }
945 
946 /*
947  * A simulated schedule:
948  * schedule instructions in their original order to calculate the original execution cycle count.
949  */
950 uint32 AArch64Schedule::SimulateOnly()
951 {
952     uint32 currCycle = 0;
953     uint32 advanceCycle = 0;
954     Init();
955 
956     for (uint32 i = 0; i < nodes.size();) {
957         while (advanceCycle > 0) {
958             ++currCycle;
959             mad->AdvanceOneCycleForAll();
960             --advanceCycle;
961         }
962 
963         DepNode *targetNode = nodes[i];
964         if ((currCycle >= targetNode->GetEStart()) && targetNode->IsResourceIdle()) {
965             targetNode->SetSimulateCycle(currCycle);
966             targetNode->OccupyRequiredUnits();
967 
968             /* Update estart. */
969             for (auto succLink : targetNode->GetSuccs()) {
970                 DepNode &succNode = succLink->GetTo();
971                 uint32 eStart = currCycle + succLink->GetLatency();
972                 if (succNode.GetEStart() < eStart) {
973                     succNode.SetEStart(eStart);
974                 }
975             }
976 
977             if (CGOptions::IsDebugSched()) {
978                 LogInfo::MapleLogger() << "[Simulate] TargetNode : ";
979                 targetNode->GetInsn()->Dump();
980                 LogInfo::MapleLogger() << "\n";
981             }
982 
983             switch (targetNode->GetInsn()->GetLatencyType()) {
984                 case kLtClinit:
985                     advanceCycle = kClinitAdvanceCycle;
986                     break;
987                 case kLtAdrpLdr:
988                     advanceCycle = kAdrpLdrAdvanceCycle;
989                     break;
990                 case kLtClinitTail:
991                     advanceCycle = kClinitTailAdvanceCycle;
992                     break;
993                 default:
994                     break;
995             }
996 
997             ++i;
998         } else {
999             advanceCycle = 1;
1000         }
1001     }
1002     /* the second-to-last node is the true last node, because the last one is a kNodeTypeSeparator node */
1003     DEBUG_ASSERT(nodes.size() >= kSecondToLastNode, "size of nodes should be greater than or equal 2");
1004     return (nodes[nodes.size() - kSecondToLastNode]->GetSimulateCycle());
1005 }
1006 
1007 /* Restore dependence graph to normal CGIR. */
1008 void AArch64Schedule::FinalizeScheduling(BB &bb, const DepAnalysis &depAnalysis)
1009 {
1010     bb.ClearInsns();
1011 
1012     const Insn *prevLocInsn = (bb.GetPrev() != nullptr ? bb.GetPrev()->GetLastLoc() : nullptr);
1013     for (auto node : nodes) {
1014         /* Append comments first. */
1015         for (auto comment : node->GetComments()) {
1016             if (comment->GetPrev() != nullptr && comment->GetPrev()->IsDbgInsn()) {
1017                 bb.AppendInsn(*comment->GetPrev());
1018             }
1019             bb.AppendInsn(*comment);
1020         }
1021         /* Append cfi instructions. */
1022         for (auto cfi : node->GetCfiInsns()) {
1023             bb.AppendInsn(*cfi);
1024         }
1025         /* Append insn. */
1026         if (!node->GetClinitInsns().empty()) {
1027             for (auto clinit : node->GetClinitInsns()) {
1028                 bb.AppendInsn(*clinit);
1029             }
1030         } else if (node->GetType() == kNodeTypeNormal) {
1031             if (node->GetInsn()->GetPrev() != nullptr && node->GetInsn()->GetPrev()->IsDbgInsn()) {
1032                 bb.AppendInsn(*node->GetInsn()->GetPrev());
1033             }
1034             bb.AppendInsn(*node->GetInsn());
1035         }
1036     }
1037     bb.SetLastLoc(prevLocInsn);
1038 
1039     for (auto lastComment : depAnalysis.GetLastComments()) {
1040         bb.AppendInsn(*lastComment);
1041     }
1042 }
1043 
1044 /* For every node of nodes, update its bruteForceSchedCycle. */
1045 void AArch64Schedule::UpdateBruteForceSchedCycle()
1046 {
1047     for (auto node : nodes) {
1048         node->SetBruteForceSchedCycle(node->GetSchedCycle());
1049     }
1050 }
1051 
1052 /* Recursively schedule all of the possible nodes. */
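/*
 * Note: this recursion enumerates candidate schedules exhaustively, so it is exponential in the
 * number of nodes; BruteForceScheduling() below only invokes it for blocks with fewer than
 * maxBruteForceNum (50) nodes.
 */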
1053 void AArch64Schedule::IterateBruteForce(DepNode &targetNode, MapleVector<DepNode *> &readyList, uint32 currCycle,
1054                                         MapleVector<DepNode *> &scheduledNodes, uint32 &maxCycleCount,
1055                                         MapleVector<DepNode *> &optimizedScheduledNodes)
1056 {
1057     /* Save states. */
1058     constexpr int32 unitSize = 31;
1059     DEBUG_ASSERT(unitSize == mad->GetAllUnitsSize(), "CG internal error.");
1060     constexpr int32 bitSetLength = 32;
1061     std::vector<std::bitset<bitSetLength>> occupyTable;
1062     occupyTable.resize(unitSize, 0);
1063     mad->SaveStates(occupyTable, unitSize);
1064 
1065     /* Schedule targetNode first. */
1066     targetNode.SetState(kScheduled);
1067     targetNode.SetSchedCycle(currCycle);
1068     scheduledNodes.emplace_back(&targetNode);
1069 
1070     MapleVector<DepNode *> tempList = readyList;
1071     EraseNodeFromNodeList(targetNode, tempList);
1072     targetNode.OccupyRequiredUnits();
1073 
1074     /* Update readyList. */
1075     UpdateReadyList(targetNode, tempList, true);
1076 
1077     if (targetNode.GetType() == kNodeTypeSeparator) {
1078         /* If target node is separator node, update lastSeparatorIndex. */
1079         lastSeparatorIndex += kMaxDependenceNum;
1080     }
1081 
1082     if (tempList.empty()) {
1083         DEBUG_ASSERT(scheduledNodes.size() == nodes.size(), "CG internal error, Not all nodes scheduled.");
1084         if (currCycle < maxCycleCount) {
1085             maxCycleCount = currCycle;
1086             UpdateBruteForceSchedCycle();
1087             optimizedScheduledNodes = scheduledNodes;
1088         }
1089     } else {
1090         uint32 advanceCycle = 0;
1091         switch (targetNode.GetInsn()->GetLatencyType()) {
1092             case kLtClinit:
1093                 advanceCycle = kClinitAdvanceCycle;
1094                 break;
1095             case kLtAdrpLdr:
1096                 advanceCycle = kAdrpLdrAdvanceCycle;
1097                 break;
1098             case kLtClinitTail:
1099                 advanceCycle = kClinitTailAdvanceCycle;
1100                 break;
1101             default:
1102                 break;
1103         }
1104 
1105         do {
1106             std::vector<DepNode *> availableReadyList;
1107             std::vector<DepNode *> tempAvailableList;
1108             while (advanceCycle > 0) {
1109                 ++currCycle;
1110                 mad->AdvanceOneCycleForAll();
1111                 --advanceCycle;
1112             }
1113             /* Check EStart. */
1114             for (auto node : tempList) {
1115                 if (node->GetEStart() <= currCycle) {
1116                     tempAvailableList.emplace_back(node);
1117                 }
1118             }
1119 
1120             if (tempAvailableList.empty()) {
1121                 /* Advance cycle. */
1122                 advanceCycle = 1;
1123                 continue;
1124             }
1125 
1126             /* Check if schedulable */
1127             for (auto node : tempAvailableList) {
1128                 if (node->IsResourceIdle()) {
1129                     availableReadyList.emplace_back(node);
1130                 }
1131             }
1132 
1133             if (availableReadyList.empty()) {
1134                 /* Advance cycle. */
1135                 advanceCycle = 1;
1136                 continue;
1137             }
1138 
1139             for (auto node : availableReadyList) {
1140                 IterateBruteForce(*node, tempList, currCycle, scheduledNodes, maxCycleCount, optimizedScheduledNodes);
1141             }
1142 
1143             break;
1144         } while (true);
1145     }
1146 
1147     /*
1148      * Recover states.
1149      * Restore targetNode first.
1150      */
1151     targetNode.SetState(kReady);
1152     targetNode.SetSchedCycle(0);
1153     scheduledNodes.pop_back();
1154     mad->RestoreStates(occupyTable, unitSize);
1155 
1156     /* Update readyList. */
1157     for (auto succLink : targetNode.GetSuccs()) {
1158         DepNode &succNode = succLink->GetTo();
1159         succNode.IncreaseValidPredsSize();
1160         succNode.SetState(kNormal);
1161     }
1162 
1163     if (targetNode.GetType() == kNodeTypeSeparator) {
1164         /* If target node is separator node, update lastSeparatorIndex. */
1165         lastSeparatorIndex -= kMaxDependenceNum;
1166     }
1167 }
1168 
1169 /*
1170  * Brute force schedule:
1171  * find all possible schedule lists of the current bb, calculate each list's execution cycles,
1172  * and return the optimal schedule list and its cycle count.
1173  */
1174 uint32 AArch64Schedule::DoBruteForceSchedule()
1175 {
1176     MapleVector<DepNode *> scheduledNodes(alloc.Adapter());
1177     MapleVector<DepNode *> optimizedScheduledNodes(alloc.Adapter());
1178 
1179     uint32 currCycle = 0;
1180     uint32 maxCycleCount = 0xFFFFFFFF;
1181     Init();
1182 
1183     /* Schedule First separator. */
1184     DepNode *targetNode = readyList.front();
1185     targetNode->SetState(kScheduled);
1186     targetNode->SetSchedCycle(currCycle);
1187     scheduledNodes.emplace_back(targetNode);
1188     readyList.clear();
1189 
1190     /* Update readyList. */
1191     UpdateReadyList(*targetNode, readyList, false);
1192 
1193     DEBUG_ASSERT(targetNode->GetType() == kNodeTypeSeparator, "The first node should be separator node.");
1194     DEBUG_ASSERT(!readyList.empty(), "readyList should not be empty.");
1195 
1196     for (auto targetNodeTemp : readyList) {
1197         IterateBruteForce(*targetNodeTemp, readyList, currCycle, scheduledNodes, maxCycleCount,
1198                           optimizedScheduledNodes);
1199     }
1200 
1201     nodes = optimizedScheduledNodes;
1202     return maxCycleCount;
1203 }
1204 
1205 /*
1206  * Update the ready list after the targetNode has been scheduled.
1207  * For every successor of targetNode, if all of its predecessors have been scheduled,
1208  * add it to the ready list and update its information (like state, estart).
1209  */
1210 void AArch64Schedule::UpdateReadyList(DepNode &targetNode, MapleVector<DepNode *> &readyList, bool updateEStart)
1211 {
1212     for (auto succLink : targetNode.GetSuccs()) {
1213         DepNode &succNode = succLink->GetTo();
1214         succNode.DecreaseValidPredsSize();
1215         if (succNode.GetValidPredsSize() == 0) {
1216             readyList.emplace_back(&succNode);
1217             succNode.SetState(kReady);
1218 
1219             /* Set eStart. */
1220             if (updateEStart) {
1221                 uint32 maxEstart = 0;
1222                 for (auto predLink : succNode.GetPreds()) {
1223                     DepNode &predNode = predLink->GetFrom();
1224                     uint32 eStart = predNode.GetSchedCycle() + predLink->GetLatency();
1225                     maxEstart = (maxEstart < eStart ? eStart : maxEstart);
1226                 }
1227                 succNode.SetEStart(maxEstart);
1228             }
1229         }
1230     }
1231 }
1232 
1233 /* For every node of nodes, dump its dependence information. */
1234 void AArch64Schedule::DumpDepGraph(const MapleVector<DepNode *> &nodes) const
1235 {
1236     for (auto node : nodes) {
1237         depAnalysis->DumpDepNode(*node);
1238         LogInfo::MapleLogger() << "---------- preds ----------"
1239                                << "\n";
1240         for (auto pred : node->GetPreds()) {
1241             depAnalysis->DumpDepLink(*pred, &(pred->GetFrom()));
1242         }
1243         LogInfo::MapleLogger() << "---------- succs ----------"
1244                                << "\n";
1245         for (auto succ : node->GetSuccs()) {
1246             depAnalysis->DumpDepLink(*succ, &(succ->GetTo()));
1247         }
1248         LogInfo::MapleLogger() << "---------------------------"
1249                                << "\n";
1250     }
1251 }
1252 
1253 /* For every node of nodes, dump its schedule time according to the simulate type, along with its instruction. */
1254 void AArch64Schedule::DumpScheduleResult(const MapleVector<DepNode *> &nodes, SimulateType type) const
1255 {
1256     for (auto node : nodes) {
1257         LogInfo::MapleLogger() << "cycle[ ";
1258         switch (type) {
1259             case kListSchedule:
1260                 LogInfo::MapleLogger() << node->GetSchedCycle();
1261                 break;
1262             case kBruteForce:
1263                 LogInfo::MapleLogger() << node->GetBruteForceSchedCycle();
1264                 break;
1265             case kSimulateOnly:
1266                 LogInfo::MapleLogger() << node->GetSimulateCycle();
1267                 break;
1268         }
1269         LogInfo::MapleLogger() << " ]  ";
1270         node->GetInsn()->Dump();
1271         LogInfo::MapleLogger() << "\n";
1272     }
1273 }
1274 
1275 /* Print bb's dependence dot graph information to a file. */
1276 void AArch64Schedule::GenerateDot(const BB &bb, const MapleVector<DepNode *> &nodes) const
1277 {
1278     std::streambuf *coutBuf = std::cout.rdbuf(); /* keep original cout buffer */
1279     std::ofstream dgFile;
1280     std::streambuf *buf = dgFile.rdbuf();
1281     std::cout.rdbuf(buf);
1282 
1283     /* construct the file name */
1284     std::string fileName;
1285     fileName.append(phaseName);
1286     fileName.append("_");
1287     fileName.append(cgFunc.GetName());
1288     fileName.append("_BB");
1289     auto str = std::to_string(bb.GetId());
1290     fileName.append(str);
1291     fileName.append("_dep_graph.dot");
1292 
1293     dgFile.open(FileUtils::GetRealPath(fileName), std::ios::trunc);
1294     if (!dgFile.is_open()) {
1295         LogInfo::MapleLogger(kLlWarn) << "fileName:" << fileName << " open failure.\n";
1296         return;
1297     }
1298     dgFile << "digraph {\n";
    for (auto node : nodes) {
        for (auto succ : node->GetSuccs()) {
            dgFile << "insn" << node->GetInsn() << " -> "
                   << "insn" << succ->GetTo().GetInsn();
            dgFile << " [";
            if (succ->GetDepType() == kDependenceTypeTrue) {
                dgFile << "color=red,";
            }
            dgFile << "label= \"" << succ->GetLatency() << "\"";
            dgFile << "];\n";
        }
    }

    for (auto node : nodes) {
        MOperator mOp = node->GetInsn()->GetMachineOpcode();
        const InsnDesc *md = &AArch64CG::kMd[mOp];
        dgFile << "insn" << node->GetInsn() << "[";
        dgFile << "shape=box,label= \" " << node->GetInsn()->GetId() << ":\n";
        dgFile << "{ ";
        dgFile << md->name << "\n";
        dgFile << "}\"];\n";
    }
    dgFile << "}\n";
    dgFile.flush();
    dgFile.close();
    std::cout.rdbuf(coutBuf);
}

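/*
 * Map a register number to its register class: physical registers are classified by the
 * AArch64 ISA queries (GP -> int, FP/SIMD -> float); virtual registers use the type
 * recorded on their RegOperand.
 */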
RegType AArch64ScheduleProcessInfo::GetRegisterType(const CGFunc &f, regno_t regNO)
{
    if (AArch64isa::IsPhysicalRegister(regNO)) {
        if (AArch64isa::IsGPRegister(static_cast<AArch64reg>(regNO))) {
            return kRegTyInt;
        } else if (AArch64isa::IsFPSIMDRegister(static_cast<AArch64reg>(regNO))) {
            return kRegTyFloat;
        } else {
            CHECK_FATAL(false, "unknown physical reg");
        }
    } else {
        RegOperand *curRegOpnd = f.GetVirtualRegisterOperand(regNO);
        DEBUG_ASSERT(curRegOpnd != nullptr, "register is neither physical nor virtual");
        return curRegOpnd->GetRegisterType();
    }
}

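/*
 * Add regNO to (isInc) or remove it from (!isInc) the live set of its register class;
 * only int (including vary) and float registers are tracked for pressure.
 */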
void AArch64ScheduleProcessInfo::VaryLiveRegSet(const CGFunc &f, regno_t regNO, bool isInc)
{
    RegType registerTy = GetRegisterType(f, regNO);
    if (registerTy == kRegTyInt || registerTy == kRegTyVary) {
        isInc ? IncIntLiveRegSet(regNO) : DecIntLiveRegSet(regNO);
    } else if (registerTy == kRegTyFloat) {
        isInc ? IncFpLiveRegSet(regNO) : DecFpLiveRegSet(regNO);
    }
    /* other register types remain to be considered */
}

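/*
 * For every register in regNOs, record node as freeing one register of the matching class
 * (int, including vary-typed base registers, or float); a CC register ends the walk early,
 * and any other register class is fatal.
 */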
void AArch64ScheduleProcessInfo::VaryFreeRegSet(const CGFunc &f, std::set<regno_t> regNOs, DepNode &node)
{
    for (auto regNO : regNOs) {
        RegType registerTy = GetRegisterType(f, regNO);
        if (registerTy == kRegTyInt || registerTy == kRegTyVary /* memory base register must be int */) {
            IncFreeIntRegNode(node);
        } else if (registerTy == kRegTyFloat) {
            IncFreeFpRegNode(node);
        } else if (registerTy == kRegTyCc) {
            /* do not count the CC register */
            return;
        } else {
            /* other register types remain to be considered */
            CHECK_FATAL(false, "unsupported register type");
        }
    }
}

/* Do brute-force scheduling and dump scheduling information. */
void AArch64Schedule::BruteForceScheduling(const BB &bb)
{
    LogInfo::MapleLogger() << "\n\n$$ Function: " << cgFunc.GetName();
    LogInfo::MapleLogger() << "\n    BB id = " << bb.GetId() << "; nodes.size = " << nodes.size() << "\n";

    constexpr uint32 maxBruteForceNum = 50;
    if (nodes.size() < maxBruteForceNum) {
        GenerateDot(bb, nodes);
        uint32 maxBruteForceCycle = DoBruteForceSchedule();
        MapleVector<DepNode *> bruteNodes = nodes;
        uint32 maxSchedCycle = DoSchedule();
        if (maxBruteForceCycle < maxSchedCycle) {
            LogInfo::MapleLogger() << "maxBruteForceCycle = " << maxBruteForceCycle << "; maxSchedCycle = ";
            LogInfo::MapleLogger() << maxSchedCycle << "\n";
            LogInfo::MapleLogger() << "\n    ## Dump dependence graph ##    "
                                   << "\n";
            DumpDepGraph(nodes);
            LogInfo::MapleLogger() << "\n    ** Dump bruteForce scheduling result."
                                   << "\n";
            DumpScheduleResult(bruteNodes, kBruteForce);
            LogInfo::MapleLogger() << "\n    ^^ Dump list scheduling result."
                                   << "\n";
            DumpScheduleResult(nodes, kListSchedule);
        }
    } else {
        LogInfo::MapleLogger() << "Skip BruteForce scheduling."
                               << "\n";
        DoSchedule();
    }
}

/* Run simulation-only scheduling, then list scheduling, and dump a comparison of the results. */
void AArch64Schedule::SimulateScheduling(const BB &bb)
{
    uint32 originCycle = SimulateOnly();
    MapleVector<DepNode *> oldNodes = nodes;
    uint32 schedCycle = DoSchedule();
    if (originCycle < schedCycle) {
        LogInfo::MapleLogger() << "Worse cycle [ " << (schedCycle - originCycle) << " ]; ";
        LogInfo::MapleLogger() << "originCycle = " << originCycle << "; schedCycle = ";
        LogInfo::MapleLogger() << schedCycle << "; nodes.size = " << nodes.size();
        LogInfo::MapleLogger() << ";    $$ Function: " << cgFunc.GetName();
        LogInfo::MapleLogger() << ";    BB id = " << bb.GetId() << "\n";
        LogInfo::MapleLogger() << "\n    ** Dump original result."
                               << "\n";
        DumpScheduleResult(oldNodes, kSimulateOnly);
        LogInfo::MapleLogger() << "\n    ^^ Dump list scheduling result."
                               << "\n";
        DumpScheduleResult(nodes, kListSchedule);
    } else if (originCycle > schedCycle) {
        LogInfo::MapleLogger() << "Advance cycle [ " << (originCycle - schedCycle) << " ]; ";
        LogInfo::MapleLogger() << "originCycle = " << originCycle << "; schedCycle = ";
        LogInfo::MapleLogger() << schedCycle << "; nodes.size = " << nodes.size();
        LogInfo::MapleLogger() << ";    $$ Function: " << cgFunc.GetName();
        LogInfo::MapleLogger() << ";    BB id = " << bb.GetId() << "\n";
    } else {
        LogInfo::MapleLogger() << "Equal cycle [ 0 ]; originCycle = " << originCycle;
        LogInfo::MapleLogger() << " ], ignore. nodes.size = " << nodes.size() << "\n";
    }
}

/*
 * Local list scheduling.
 * Schedules insns within each basic block.
 */
void AArch64Schedule::ListScheduling(bool beforeRA)
{
    InitIDAndLoc();

    mad = Globals::GetInstance()->GetMAD();
    if (beforeRA) {
        RegPressure::SetMaxRegClassNum(kRegisterLast);
    }
    depAnalysis = memPool.New<AArch64DepAnalysis>(cgFunc, memPool, *mad, beforeRA);
    FOR_ALL_BB(bb, &cgFunc)
    {
        if (bb->IsUnreachable()) {
            continue;
        }
        if (bb->IsAtomicBuiltInBB()) {
            continue;
        }
        depAnalysis->Run(*bb, nodes);

        if (LIST_SCHED_DUMP_REF) {
            GenerateDot(*bb, nodes);
            DumpDepGraph(nodes);
        }
        if (beforeRA) {
            liveInRegNo = bb->GetLiveInRegNO();
            liveOutRegNo = bb->GetLiveOutRegNO();
            if (bb->GetKind() != BB::kBBReturn) {
                SetConsiderRegPressure();
                DoSchedule();
            } else {
                RegPressureScheduling(*bb, nodes);
            }
        } else {
            ClinitPairOpt();
            MemoryAccessPairOpt();
            if (CGOptions::IsDruteForceSched()) {
                BruteForceScheduling(*bb);
            } else if (CGOptions::IsSimulateSched()) {
                SimulateScheduling(*bb);
            } else {
                DoSchedule();
            }
        }

        FinalizeScheduling(*bb, *depAnalysis);
    }
}
} /* namespace maplebe */