1 /*
2 * Copyright (c) 2023 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "aarch64_schedule.h"
17 #include <ctime>
18 #include "aarch64_cg.h"
19 #include "aarch64_operand.h"
20 #include "aarch64_dependence.h"
21 #include "file_utils.h"
22 #include "pressure.h"
23
24 /*
25 * This phase is Instruction Scheduling.
26 * It performs local list scheduling, i.e. scheduling within a basic block.
27 * The entry is AArch64Schedule::ListScheduling, which traverses all basic blocks;
28 * for each basic block it will:
29 * 1. build a dependence graph;
30 * 2. combine clinit pairs and str&ldr pairs;
31 * 3. reorder instructions.
32 */
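/*
 * Editorial illustration (not part of the original phase): a minimal, self-contained sketch of
 * list scheduling over a toy dependence DAG, assuming each node carries the latency to its
 * successors. The ToyNode/ToyListSchedule names are hypothetical and are not the DepNode/DepLink
 * types used below; the block is guarded with #if 0 so it does not affect the build.
 */
#if 0
#include <algorithm>
#include <cstdint>
#include <queue>
#include <vector>

struct ToyNode {
    uint32_t latency = 1;             /* cycles before dependent nodes may start */
    std::vector<uint32_t> succs;      /* indices of dependent nodes */
    uint32_t unscheduledPreds = 0;    /* pending in-degree */
    uint32_t eStart = 0;              /* earliest cycle this node may issue */
};

/* Return one legal issue order: repeatedly pick the ready node with the smallest eStart. */
static std::vector<uint32_t> ToyListSchedule(std::vector<ToyNode> dag)
{
    for (const ToyNode &n : dag) {
        for (uint32_t s : n.succs) {
            ++dag[s].unscheduledPreds;
        }
    }
    auto later = [&dag](uint32_t a, uint32_t b) { return dag[a].eStart > dag[b].eStart; };
    std::priority_queue<uint32_t, std::vector<uint32_t>, decltype(later)> ready(later);
    for (uint32_t i = 0; i < dag.size(); ++i) {
        if (dag[i].unscheduledPreds == 0) {
            ready.push(i);
        }
    }
    std::vector<uint32_t> order;
    while (!ready.empty()) {
        uint32_t cur = ready.top();
        ready.pop();
        order.push_back(cur);                 /* "schedule" cur */
        for (uint32_t s : dag[cur].succs) {
            ToyNode &succ = dag[s];
            succ.eStart = std::max(succ.eStart, dag[cur].eStart + dag[cur].latency);
            if (--succ.unscheduledPreds == 0) {
                ready.push(s);                /* every predecessor issued: s becomes ready */
            }
        }
    }
    return order;                             /* a legal issue order of the block */
}
#endif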
33 namespace maplebe {
34 namespace {
35 constexpr uint32 kSecondToLastNode = 2;
36 } // namespace
37
38 uint32 AArch64Schedule::maxUnitIndex = 0;
39 /* reserve two registers for special purposes */
40 int AArch64Schedule::intRegPressureThreshold = static_cast<int>(R27 - R0);
41 int AArch64Schedule::fpRegPressureThreshold = static_cast<int>(V30 - V0);
42 int AArch64Schedule::intCalleeSaveThresholdBase = static_cast<int>(R29 - R19);
43 int AArch64Schedule::intCalleeSaveThresholdEnhance = static_cast<int>(R30 - R19);
44 int AArch64Schedule::fpCalleeSaveThreshold = static_cast<int>(R16 - R8);
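/*
 * Editorial note: the thresholds above are plain enumerator differences, so, assuming the
 * AArch64reg enumerators R0..R31 and V0..V31 are consecutive, they work out to:
 *   intRegPressureThreshold       = R27 - R0  = 27
 *   fpRegPressureThreshold        = V30 - V0  = 30
 *   intCalleeSaveThresholdBase    = R29 - R19 = 10
 *   intCalleeSaveThresholdEnhance = R30 - R19 = 11
 *   fpCalleeSaveThreshold         = R16 - R8  = 8
 */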
45 /* Initialize the schedule's data structures. */
46 void AArch64Schedule::Init()
47 {
48 readyList.clear();
49 nodeSize = nodes.size();
50 lastSeparatorIndex = 0;
51 mad->ReleaseAllUnits();
52 DepNode *node = nodes[0];
53
54 DEBUG_ASSERT(node->GetType() == kNodeTypeSeparator,
55 "CG internal error, the first node should be a separator node.");
56
57 if (CGOptions::IsDruteForceSched() || CGOptions::IsSimulateSched()) {
58 for (auto nodeTemp : nodes) {
59 nodeTemp->SetVisit(0);
60 nodeTemp->SetState(kNormal);
61 nodeTemp->SetSchedCycle(0);
62 nodeTemp->SetEStart(0);
63 nodeTemp->SetLStart(0);
64 }
65 }
66
67 readyList.emplace_back(node);
68 node->SetState(kReady);
69
70 /* Init validPredsSize and validSuccsSize. */
71 for (auto nodeTemp : nodes) {
72 nodeTemp->SetValidPredsSize(nodeTemp->GetPreds().size());
73 nodeTemp->SetValidSuccsSize(nodeTemp->GetSuccs().size());
74 }
75 }
76
77 /*
78 * An insn that can be combined should meet these conditions:
79 * 1. it is a str/ldr insn;
80 * 2. its addressing mode is kAddrModeBOi, i.e. [baseReg, offset];
81 * 3. the register operand size equals the memory operand size;
82 * 4. if USE_32BIT_REF is defined, the register operand size should be 4 bytes;
83 * 5. for stp/ldp, the imm should be between -512 and 504 (64-bit), or -256 and 252 (32-bit);
84 * 6. the imm of a pair instruction for 8/4-byte registers must be a multiple of 8/4.
85 * Return true if the insn can be combined; a standalone sketch of conditions 5 and 6 follows this function.
86 */
87 bool AArch64Schedule::CanCombine(const Insn &insn) const
88 {
89 MOperator opCode = insn.GetMachineOpcode();
90 if ((opCode != MOP_xldr) && (opCode != MOP_wldr) && (opCode != MOP_dldr) && (opCode != MOP_sldr) &&
91 (opCode != MOP_xstr) && (opCode != MOP_wstr) && (opCode != MOP_dstr) && (opCode != MOP_sstr)) {
92 return false;
93 }
94
95 DEBUG_ASSERT(insn.GetOperand(1).IsMemoryAccessOperand(), "expects mem operands");
96 auto &memOpnd = static_cast<MemOperand &>(insn.GetOperand(1));
97 MemOperand::AArch64AddressingMode addrMode = memOpnd.GetAddrMode();
98 if (addrMode != MemOperand::kAddrModeBOi) {
99 return false;
100 }
101
102 auto &regOpnd = static_cast<RegOperand &>(insn.GetOperand(0));
103 if (regOpnd.GetSize() != memOpnd.GetSize()) {
104 return false;
105 }
106
107 uint32 size = regOpnd.GetSize() >> kLog2BitsPerByte;
108 #ifdef USE_32BIT_REF
109 if (insn.IsAccessRefField() && (size > (kAarch64IntregBytelen >> 1))) {
110 return false;
111 }
112 #endif /* USE_32BIT_REF */
113
114 OfstOperand *offset = memOpnd.GetOffsetImmediate();
115 if (offset == nullptr) {
116 return false;
117 }
118 int32 offsetValue = static_cast<int32>(offset->GetOffsetValue());
119 if (size == kAarch64IntregBytelen) { /* 64 bit */
120 if ((offsetValue <= kStpLdpImm64LowerBound) || (offsetValue >= kStpLdpImm64UpperBound)) {
121 return false;
122 }
123 } else if (size == (kAarch64IntregBytelen >> 1)) { /* 32 bit */
124 if ((offsetValue <= kStpLdpImm32LowerBound) || (offsetValue >= kStpLdpImm32UpperBound)) {
125 return false;
126 }
127 }
128
129 /* the imm of a pair instruction for 8/4-byte registers must be a multiple of 8/4 */
130 if ((static_cast<uint32>(offsetValue) % size) != 0) {
131 return false;
132 }
133 return true;
134 }
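/*
 * Editorial illustration, excluded from the build: a standalone check of the stp/ldp offset
 * rules listed above (conditions 5 and 6), assuming the architectural ranges -512..504 (64-bit)
 * and -256..252 (32-bit) quoted in the comment. ToyOffsetLegalForPair is a hypothetical name and
 * is not used by the scheduler.
 */
#if 0
#include <cstdint>

static bool ToyOffsetLegalForPair(int32_t offset, uint32_t accessByteSize)
{
    /* signed 7-bit immediate scaled by the access size */
    const int32_t lower = (accessByteSize == 8) ? -512 : -256;
    const int32_t upper = (accessByteSize == 8) ? 504 : 252;
    if (offset < lower || offset > upper) {
        return false;                                              /* condition 5: immediate range */
    }
    return (offset % static_cast<int32_t>(accessByteSize)) == 0;   /* condition 6: alignment */
}

/* Examples: ToyOffsetLegalForPair(16, 8) -> true; ToyOffsetLegalForPair(12, 8) -> false. */
#endif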
135
136 /* After building dependence graph, combine str&ldr pairs. */
137 void AArch64Schedule::MemoryAccessPairOpt()
138 {
139 Init();
140 std::vector<DepNode *> memList;
141
142 while ((!readyList.empty()) || !memList.empty()) {
143 DepNode *readNode = nullptr;
144 if (!readyList.empty()) {
145 readNode = readyList[0];
146 readyList.erase(readyList.begin());
147 } else {
148 if (memList[0]->GetType() != kNodeTypeEmpty) {
149 FindAndCombineMemoryAccessPair(memList);
150 }
151 readNode = memList[0];
152 memList.erase(memList.cbegin());
153 }
154
155 /* schedule readNode */
156 CHECK_FATAL(readNode != nullptr, "readNode is null in MemoryAccessPairOpt");
157 readNode->SetState(kScheduled);
158
159 /* add readNode's succs to readyList or memList. */
160 for (auto succLink : readNode->GetSuccs()) {
161 DepNode &succNode = succLink->GetTo();
162 succNode.DecreaseValidPredsSize();
163 if (succNode.GetValidPredsSize() == 0) {
164 DEBUG_ASSERT(succNode.GetState() == kNormal, "schedule state should be kNormal");
165 succNode.SetState(kReady);
166 DEBUG_ASSERT(succNode.GetInsn() != nullptr, "insn can't be nullptr!");
167 if (CanCombine(*succNode.GetInsn())) {
168 memList.emplace_back(&succNode);
169 } else {
170 readyList.emplace_back(&succNode);
171 }
172 }
173 }
174 }
175
176 for (auto node : nodes) {
177 node->SetVisit(0);
178 node->SetState(kNormal);
179 }
180 }
181
182 /* Find and combine correct MemoryAccessPair for memList[0]. */
183 void AArch64Schedule::FindAndCombineMemoryAccessPair(const std::vector<DepNode *> &memList)
184 {
185 DEBUG_ASSERT(!memList.empty(), "memList should not be empty");
186 CHECK_FATAL(memList[0]->GetInsn() != nullptr, "memList[0]'s insn should not be nullptr");
187 MemOperand *currMemOpnd = static_cast<MemOperand *>(memList[0]->GetInsn()->GetMemOpnd());
188 DEBUG_ASSERT(currMemOpnd != nullptr, "opnd should not be nullptr");
189 DEBUG_ASSERT(currMemOpnd->IsMemoryAccessOperand(), "opnd should be memOpnd");
190 int32 currOffsetVal = static_cast<int32>(currMemOpnd->GetOffsetImmediate()->GetOffsetValue());
191 MOperator currMop = memList[0]->GetInsn()->GetMachineOpcode();
192 /* find a depNode to combine with memList[0], and break; */
193 for (auto it = std::next(memList.begin(), 1); it != memList.end(); ++it) {
194 DEBUG_ASSERT((*it)->GetInsn() != nullptr, "null ptr check");
195
196 if (currMop == (*it)->GetInsn()->GetMachineOpcode()) {
197 MemOperand *nextMemOpnd = static_cast<MemOperand *>((*it)->GetInsn()->GetMemOpnd());
198 CHECK_FATAL(nextMemOpnd != nullptr, "opnd should not be nullptr");
199 CHECK_FATAL(nextMemOpnd->IsMemoryAccessOperand(), "opnd should be MemOperand");
200 int32 nextOffsetVal = static_cast<int32>(nextMemOpnd->GetOffsetImmediate()->GetOffsetValue());
201 uint32 size = currMemOpnd->GetSize() >> kLog2BitsPerByte;
202 if ((nextMemOpnd->GetBaseRegister() == currMemOpnd->GetBaseRegister()) &&
203 (nextMemOpnd->GetSize() == currMemOpnd->GetSize()) &&
204 (static_cast<uint32>(abs(nextOffsetVal - currOffsetVal)) == size)) {
205 /*
206 * The ARM Architecture Reference Manual ARMv8, for the ARMv8-A architecture profile,
207 * declares for LDP (page K1-6125) that ldp cannot load the same register twice
208 */
209 if (((currMop == MOP_xldr) || (currMop == MOP_sldr) || (currMop == MOP_dldr) ||
210 (currMop == MOP_wldr)) &&
211 &(memList[0]->GetInsn()->GetOperand(0)) == &((*it)->GetInsn()->GetOperand(0))) {
212 continue;
213 }
214 if (static_cast<RegOperand &>((*it)->GetInsn()->GetOperand(0)).GetRegisterType() !=
215 static_cast<RegOperand &>(memList[0]->GetInsn()->GetOperand(0)).GetRegisterType()) {
216 continue;
217 }
218
219 if (LIST_SCHED_DUMP_REF) {
220 LogInfo::MapleLogger() << "Combine insn: "
221 << "\n";
222 memList[0]->GetInsn()->Dump();
223 (*it)->GetInsn()->Dump();
224 }
225 depAnalysis->CombineMemoryAccessPair(*memList[0], **it, nextOffsetVal > currOffsetVal);
226 if (LIST_SCHED_DUMP_REF) {
227 LogInfo::MapleLogger() << "To: "
228 << "\n";
229 memList[0]->GetInsn()->Dump();
230 }
231 break;
232 }
233 }
234 }
235 }
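/*
 * Editorial illustration, excluded from the build: the pairing test used above, restated on plain
 * integers. Two same-opcode accesses qualify when they share a base register and access size,
 * their offsets are exactly one access apart, and (for loads) the destination registers differ,
 * per the LDP restriction cited above. All names here are hypothetical.
 */
#if 0
#include <cstdint>
#include <cstdlib>

struct ToyAccess {
    uint32_t baseReg;
    uint32_t destReg;
    uint32_t byteSize;
    int32_t offset;
    bool isLoad;
};

static bool ToyCanPair(const ToyAccess &a, const ToyAccess &b)
{
    if (a.baseReg != b.baseReg || a.byteSize != b.byteSize) {
        return false;
    }
    if (static_cast<uint32_t>(std::abs(b.offset - a.offset)) != a.byteSize) {
        return false;               /* must occupy adjacent slots */
    }
    if (a.isLoad && a.destReg == b.destReg) {
        return false;               /* ldp cannot load the same register twice */
    }
    return true;
}
#endif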
236
237 /* combine clinit pairs. */
238 void AArch64Schedule::ClinitPairOpt()
239 {
240 for (auto it = nodes.begin(); it != nodes.end(); ++it) {
241 auto nextIt = std::next(it, 1);
242 if (nextIt == nodes.end()) {
243 return;
244 }
245
246 if ((*it)->GetInsn()->GetMachineOpcode() == MOP_adrp_ldr) {
247 if ((*nextIt)->GetInsn()->GetMachineOpcode() == MOP_clinit_tail) {
248 depAnalysis->CombineClinit(**it, **(nextIt), false);
249 } else if ((*nextIt)->GetType() == kNodeTypeSeparator) {
250 nextIt = std::next(nextIt, 1);
251 if (nextIt == nodes.end()) {
252 return;
253 }
254 if ((*nextIt)->GetInsn()->GetMachineOpcode() == MOP_clinit_tail) {
255 /* combine adrp_ldr with clinit_tail across the separator node. */
256 depAnalysis->CombineClinit(**it, **(nextIt), true);
257 }
258 }
259 }
260 }
261 }
262
263 /* Return the index of the next node whose type is kNodeTypeSeparator. */
264 uint32 AArch64Schedule::GetNextSepIndex() const
265 {
266 CHECK_FATAL(nodes.size() >= 1, "value overflow");
267 return ((lastSeparatorIndex + kMaxDependenceNum) < nodeSize) ? (lastSeparatorIndex + kMaxDependenceNum)
268 : (nodes.size() - 1);
269 }
270
271 /* Do register pressure scheduling. */
272 void AArch64Schedule::RegPressureScheduling(BB &bb, MapleVector<DepNode *> &nodes)
273 {
274 auto *regSchedule = memPool.New<RegPressureSchedule>(cgFunc, alloc);
275 /*
276 * Get the current number of physical registers:
277 * undef, int regs, float regs, flag regs
278 */
279 const std::vector<int32> kRegNumVec = {0, V0, (kMaxRegNum - V0) + 1, 1};
280 regSchedule->InitBBInfo(bb, memPool, nodes);
281 regSchedule->BuildPhyRegInfo(kRegNumVec);
282 regSchedule->DoScheduling(nodes);
283 }
284
285 /*
286 * Compute earliest start of the node,
287 * return value : the maximum estart.
288 */
289 uint32 AArch64Schedule::ComputeEstart(uint32 cycle)
290 {
291 std::vector<DepNode *> readyNodes;
292 uint32 maxIndex = GetNextSepIndex();
293
294 if (CGOptions::IsDebugSched()) {
295 /* Check validPredsSize. */
296 for (uint32 i = lastSeparatorIndex; i <= maxIndex; ++i) {
297 DepNode *node = nodes[i];
298 int32 schedNum = 0;
299 for (const auto *predLink : node->GetPreds()) {
300 if (predLink->GetFrom().GetState() == kScheduled) {
301 ++schedNum;
302 }
303 }
304 DEBUG_ASSERT(static_cast<uint32>(node->GetPreds().size()) >= static_cast<uint32>(schedNum),
305 "unsigned value overflow");
306 CHECK_FATAL((static_cast<uint32>(node->GetPreds().size()) - static_cast<uint32>(schedNum)) ==
307 node->GetValidPredsSize(),
308 "validPredsSize error.");
309 }
310 }
311
312 DEBUG_ASSERT(nodes[maxIndex]->GetType() == kNodeTypeSeparator,
313 "CG internal error, nodes[maxIndex] should be a separator node.");
314
315 (void)readyNodes.insert(readyNodes.cbegin(), readyList.cbegin(), readyList.cend());
316
317 uint32 maxEstart = cycle;
318 for (uint32 i = lastSeparatorIndex; i <= maxIndex; ++i) {
319 DepNode *node = nodes[i];
320 node->SetVisit(0);
321 }
322
323 for (auto *node : readyNodes) {
324 DEBUG_ASSERT(node->GetState() == kReady, "CG internal error, all nodes in ready list should be ready.");
325 if (node->GetEStart() < cycle) {
326 node->SetEStart(cycle);
327 }
328 }
329
330 while (!readyNodes.empty()) {
331 DepNode *node = readyNodes.front();
332 readyNodes.erase(readyNodes.cbegin());
333
334 for (const auto *succLink : node->GetSuccs()) {
335 DepNode &succNode = succLink->GetTo();
336 if (succNode.GetType() == kNodeTypeSeparator) {
337 continue;
338 }
339
340 if (succNode.GetEStart() < (node->GetEStart() + succLink->GetLatency())) {
341 succNode.SetEStart(node->GetEStart() + succLink->GetLatency());
342 }
343 maxEstart = (maxEstart < succNode.GetEStart() ? succNode.GetEStart() : maxEstart);
344 succNode.IncreaseVisit();
345 if ((succNode.GetVisit() >= succNode.GetValidPredsSize()) && (succNode.GetType() != kNodeTypeSeparator)) {
346 readyNodes.emplace_back(&succNode);
347 }
348 DEBUG_ASSERT(succNode.GetVisit() <= succNode.GetValidPredsSize(), "CG internal error.");
349 }
350 }
351
352 return maxEstart;
353 }
354
355 /* Compute latest start of the node. */
356 void AArch64Schedule::ComputeLstart(uint32 maxEstart)
357 {
358 /* std::vector is better than std::queue in run time */
359 std::vector<DepNode *> readyNodes;
360 uint32 maxIndex = GetNextSepIndex();
361
362 DEBUG_ASSERT(nodes[maxIndex]->GetType() == kNodeTypeSeparator,
363 "CG internal error, nodes[maxIndex] should be a separator node.");
364
365 for (uint32 i = lastSeparatorIndex; i <= maxIndex; ++i) {
366 DepNode *node = nodes[i];
367 node->SetLStart(maxEstart);
368 node->SetVisit(0);
369 }
370
371 readyNodes.emplace_back(nodes[maxIndex]);
372 while (!readyNodes.empty()) {
373 DepNode *node = readyNodes.front();
374 readyNodes.erase(readyNodes.cbegin());
375 for (const auto *predLink : node->GetPreds()) {
376 DepNode &predNode = predLink->GetFrom();
377 if (predNode.GetState() == kScheduled) {
378 continue;
379 }
380
381 if (predNode.GetLStart() > (node->GetLStart() - predLink->GetLatency())) {
382 predNode.SetLStart(node->GetLStart() - predLink->GetLatency());
383 }
384 predNode.IncreaseVisit();
385 if ((predNode.GetVisit() >= predNode.GetValidSuccsSize()) && (predNode.GetType() != kNodeTypeSeparator)) {
386 readyNodes.emplace_back(&predNode);
387 }
388
389 DEBUG_ASSERT(predNode.GetVisit() <= predNode.GetValidSuccsSize(), "CG internal error.");
390 }
391 }
392 }
393
394 /* Compute the earliest start and latest start of every node that is in the readyList and has not been scheduled. */
395 void AArch64Schedule::UpdateELStartsOnCycle(uint32 cycle)
396 {
397 ComputeLstart(ComputeEstart(cycle));
398 }
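/*
 * Editorial illustration, excluded from the build: ComputeEstart/ComputeLstart above amount to a
 * forward and a backward longest-path pass over the region's DAG. This sketch does both passes on
 * a toy latency-weighted edge list, assuming the edge list is topologically ordered by source node.
 * Names are hypothetical; nodes with eStart == lStart lie on the critical path of the region.
 */
#if 0
#include <algorithm>
#include <cstdint>
#include <vector>

struct ToyEdge {
    uint32_t from;
    uint32_t to;
    uint32_t latency;
};

static void ToyComputeEStartLStart(uint32_t nodeCount, const std::vector<ToyEdge> &edges,
                                   std::vector<uint32_t> &eStart, std::vector<uint32_t> &lStart)
{
    eStart.assign(nodeCount, 0);
    /* forward pass: a node may not start before any predecessor's result is ready */
    for (const ToyEdge &e : edges) {
        eStart[e.to] = std::max(eStart[e.to], eStart[e.from] + e.latency);
    }
    uint32_t maxEStart = 0;
    for (uint32_t v : eStart) {
        maxEStart = std::max(maxEStart, v);
    }
    /* backward pass: a node may not start later than any successor's LStart minus the edge latency */
    lStart.assign(nodeCount, maxEStart);
    for (auto it = edges.rbegin(); it != edges.rend(); ++it) {
        lStart[it->from] = std::min(lStart[it->from], lStart[it->to] - it->latency);
    }
}
#endif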
399
400 /* Count unit kinds into an array. Each element of the array records how many nodes use that unit kind. */
401 void AArch64Schedule::CountUnitKind(const DepNode &depNode, uint32 array[], const uint32 arraySize) const
402 {
403 (void)arraySize;
404 DEBUG_ASSERT(arraySize >= kUnitKindLast, "CG internal error. unit kind number is not correct.");
405 uint32 unitKind = depNode.GetUnitKind();
406 uint32 index = static_cast<uint32>(__builtin_ffs(static_cast<int>(unitKind)));
407 while (index != 0) {
408 DEBUG_ASSERT(index < kUnitKindLast, "CG internal error. index error.");
409 ++array[index];
410 unitKind &= ~(1u << (index - 1u));
411 index = static_cast<uint32>(__builtin_ffs(static_cast<int>(unitKind)));
412 }
413 }
414
415 /* Check whether a node uses a specific unit kind. */
416 bool AArch64Schedule::IfUseUnitKind(const DepNode &depNode, uint32 index)
417 {
418 uint32 unitKind = depNode.GetUnitKind();
419 uint32 idx = static_cast<uint32>(__builtin_ffs(static_cast<int>(unitKind)));
420 while (idx != 0) {
421 DEBUG_ASSERT(index < kUnitKindLast, "CG internal error. index error.");
422 if (idx == index) {
423 return true;
424 }
425 unitKind &= ~(1u << (idx - 1u));
426 idx = static_cast<uint32>(__builtin_ffs(static_cast<int>(unitKind)));
427 }
428
429 return false;
430 }
431
432 /* A sample schedule according to the dependence graph only, to verify the correctness of the dependence graph. */
433 void AArch64Schedule::RandomTest()
434 {
435 Init();
436 nodes.clear();
437
438 while (!readyList.empty()) {
439 DepNode *currNode = readyList.back();
440 currNode->SetState(kScheduled);
441 readyList.pop_back();
442 nodes.emplace_back(currNode);
443
444 for (auto succLink : currNode->GetSuccs()) {
445 DepNode &succNode = succLink->GetTo();
446 bool ready = true;
447 for (auto predLink : succNode.GetPreds()) {
448 DepNode &predNode = predLink->GetFrom();
449 if (predNode.GetState() != kScheduled) {
450 ready = false;
451 break;
452 }
453 }
454
455 if (ready) {
456 DEBUG_ASSERT(succNode.GetState() == kNormal, "succNode must be kNormal");
457 readyList.emplace_back(&succNode);
458 succNode.SetState(kReady);
459 }
460 }
461 }
462 }
463
464 /* Remove target from readyList. */
465 void AArch64Schedule::EraseNodeFromReadyList(const DepNode &target)
466 {
467 EraseNodeFromNodeList(target, readyList);
468 }
469
470 /* Remove target from nodeList. */
471 void AArch64Schedule::EraseNodeFromNodeList(const DepNode &target, MapleVector<DepNode *> &nodeList)
472 {
473 for (auto it = nodeList.begin(); it != nodeList.end(); ++it) {
474 if ((*it) == &target) {
475 nodeList.erase(it);
476 return;
477 }
478 }
479
480 DEBUG_ASSERT(false, "CG internal error, erase node fail.");
481 }
482
483 /* Dump the scheduling information of every node in availableReadyList in the current cycle. */
484 void AArch64Schedule::DumpDebugInfo(const ScheduleProcessInfo &scheduleInfo)
485 {
486 LogInfo::MapleLogger() << "Current cycle[ " << scheduleInfo.GetCurrCycle() << " ], Available in readyList is : \n";
487 for (auto node : scheduleInfo.GetAvailableReadyList()) {
488 LogInfo::MapleLogger() << "NodeIndex[ " << node->GetIndex() << " ], Estart[ " << node->GetEStart()
489 << " ], Lstart[ ";
490 LogInfo::MapleLogger() << node->GetLStart() << " ], slot[ ";
491 LogInfo::MapleLogger() << (node->GetReservation() == nullptr ? "SlotNone"
492 : node->GetReservation()->GetSlotName())
493 << " ], ";
494 LogInfo::MapleLogger() << "succNodeNum[ " << node->GetSuccs().size() << " ], ";
495 node->GetInsn()->Dump();
496 LogInfo::MapleLogger() << '\n';
497 }
498 }
499
500 /*
501 * Select a node from availableReadyList according to some heuristic rules, then:
502 * 1. change targetNode's schedule information;
503 * 2. try to add successors of targetNode to readyList;
504 * 3. update unscheduled node set, when targetNode is last kNodeTypeSeparator;
505 * 4. update AdvanceCycle.
506 */
507 void AArch64Schedule::SelectNode(AArch64ScheduleProcessInfo &scheduleInfo)
508 {
509 auto &availableReadyList = scheduleInfo.GetAvailableReadyList();
510 auto it = availableReadyList.begin();
511 DepNode *targetNode = *it;
512 if (availableReadyList.size() > 1) {
513 CalculateMaxUnitKindCount(scheduleInfo);
514 if (GetConsiderRegPressure()) {
515 UpdateReleaseRegInfo(scheduleInfo);
516 }
517 ++it;
518 for (; it != availableReadyList.end(); ++it) {
519 if (CompareDepNode(**it, *targetNode, scheduleInfo)) {
520 targetNode = *it;
521 }
522 }
523 }
524 /* A register-freeing node takes priority over pipeline availability */
525 while (!targetNode->IsResourceIdle()) {
526 scheduleInfo.IncCurrCycle();
527 mad->AdvanceOneCycleForAll();
528 }
529 if (GetConsiderRegPressure() && !scheduleInfo.IsFirstSeparator()) {
530 UpdateLiveRegSet(scheduleInfo, *targetNode);
531 }
532 /* push target node into scheduled nodes and turn it into kScheduled state */
533 scheduleInfo.PushElemIntoScheduledNodes(targetNode);
534
535 EraseNodeFromReadyList(*targetNode);
536
537 if (CGOptions::IsDebugSched()) {
538 LogInfo::MapleLogger() << "TargetNode : ";
539 targetNode->GetInsn()->Dump();
540 LogInfo::MapleLogger() << "\n";
541 }
542
543 /* Update readyList. */
544 UpdateReadyList(*targetNode, readyList, true);
545
546 if (targetNode->GetType() == kNodeTypeSeparator) {
547 /* If the target node is a separator node, update lastSeparatorIndex and calculate the estart and lstart
548 * of the depNodes between the current separator node and the new separator node.
549 */
550 if (!scheduleInfo.IsFirstSeparator()) {
551 lastSeparatorIndex += kMaxDependenceNum;
552 UpdateELStartsOnCycle(scheduleInfo.GetCurrCycle());
553 } else {
554 scheduleInfo.ResetIsFirstSeparator();
555 }
556 }
557
558 UpdateAdvanceCycle(scheduleInfo, *targetNode);
559 }
560
561 void AArch64Schedule::UpdateAdvanceCycle(AArch64ScheduleProcessInfo &scheduleInfo, const DepNode &targetNode) const
562 {
563 switch (targetNode.GetInsn()->GetLatencyType()) {
564 case kLtClinit:
565 scheduleInfo.SetAdvanceCycle(kClinitAdvanceCycle);
566 break;
567 case kLtAdrpLdr:
568 scheduleInfo.SetAdvanceCycle(kAdrpLdrAdvanceCycle);
569 break;
570 case kLtClinitTail:
571 scheduleInfo.SetAdvanceCycle(kClinitTailAdvanceCycle);
572 break;
573 default:
574 break;
575 }
576
577 if ((scheduleInfo.GetAdvanceCycle() == 0) && mad->IsFullIssued()) {
578 if (targetNode.GetEStart() > scheduleInfo.GetCurrCycle()) {
579 scheduleInfo.SetAdvanceCycle(1 + targetNode.GetEStart() - scheduleInfo.GetCurrCycle());
580 } else {
581 scheduleInfo.SetAdvanceCycle(1);
582 }
583 }
584 }
585
586 /*
587 * Advance mad's cycle until info's advanceCycle equals zero,
588 * and then clear info's availableReadyList.
589 */
590 void AArch64Schedule::UpdateScheduleProcessInfo(AArch64ScheduleProcessInfo &info) const
591 {
592 while (info.GetAdvanceCycle() > 0) {
593 info.IncCurrCycle();
594 mad->AdvanceOneCycleForAll();
595 info.DecAdvanceCycle();
596 }
597 info.ClearAvailableReadyList();
598 }
599
600 /*
601 * Traverse the readyList forward; if a node in the readyList can be scheduled, add it to availableReadyList.
602 * Return true if availableReadyList is not empty.
603 */
604 bool AArch64Schedule::CheckSchedulable(AArch64ScheduleProcessInfo &info) const
605 {
606 for (auto node : readyList) {
607 if (GetConsiderRegPressure()) {
608 info.PushElemIntoAvailableReadyList(node);
609 } else {
610 if (node->IsResourceIdle() && node->GetEStart() <= info.GetCurrCycle()) {
611 info.PushElemIntoAvailableReadyList(node);
612 }
613 }
614 }
615 return !info.AvailableReadyListIsEmpty();
616 }
617
618 /*
619 * Calculate estimated machine cycle count for an input node series
620 */
621 int AArch64Schedule::CalSeriesCycles(const MapleVector<DepNode *> &nodes) const
622 {
623 int currentCycle = 0;
624 /* after an instruction is issued, the minimum cycle count for the next instruction is 1 */
625 int instructionBaseCycleCount = 1;
626 std::map<DepNode *, int> scheduledCycleMap;
627 for (auto node : nodes) {
628 int latencyCycle = 0;
629 /* calculate the latest begin time of this node based on its predecessor's issue time and latency */
630 for (auto pred : node->GetPreds()) {
631 DepNode &from = pred->GetFrom();
632 int latency = static_cast<int>(pred->GetLatency());
633 int fromCycle = scheduledCycleMap[&from];
634 if (fromCycle + latency > latencyCycle) {
635 latencyCycle = fromCycle + latency;
636 }
637 }
638 /* the issue time of this node is the max value between the next cycle and latest begin time */
639 if (currentCycle + instructionBaseCycleCount >= latencyCycle) {
640 currentCycle = currentCycle + instructionBaseCycleCount;
641 } else {
642 currentCycle = latencyCycle;
643 }
644 /* record this node's issue cycle */
645 scheduledCycleMap[node] = currentCycle;
646 }
647 return currentCycle;
648 }
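/*
 * Editorial illustration, excluded from the build: CalSeriesCycles on a concrete three-instruction
 * chain. With latencies A->B = 3 and B->C = 1, issuing in order A, B, C gives cycles 1, 4, 5:
 * B must wait for A's latency, C only for the next issue slot after B. The names and numbers here
 * are hypothetical.
 */
#if 0
#include <algorithm>
#include <map>
#include <utility>
#include <vector>

struct ToySeriesNode {
    std::vector<std::pair<const ToySeriesNode *, int>> preds;  /* (predecessor, latency) */
};

static int ToyCalSeriesCycles(const std::vector<ToySeriesNode *> &series)
{
    int currentCycle = 0;
    std::map<const ToySeriesNode *, int> issuedAt;
    for (const ToySeriesNode *node : series) {
        int latencyCycle = 0;
        for (const auto &pred : node->preds) {
            latencyCycle = std::max(latencyCycle, issuedAt[pred.first] + pred.second);
        }
        currentCycle = std::max(currentCycle + 1, latencyCycle);  /* next slot vs. operand readiness */
        issuedAt[node] = currentCycle;
    }
    return currentCycle;
}
#endif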
649
650 /* After building dependence graph, schedule insns. */
651 uint32 AArch64Schedule::DoSchedule()
652 {
653 AArch64ScheduleProcessInfo scheduleInfo(nodeSize);
654 Init();
655 UpdateELStartsOnCycle(scheduleInfo.GetCurrCycle());
656 InitLiveRegSet(scheduleInfo);
657 while (!readyList.empty()) {
658 UpdateScheduleProcessInfo(scheduleInfo);
659 /* Check if schedulable */
660 if (!CheckSchedulable(scheduleInfo)) {
661 /* Advance cycle. */
662 scheduleInfo.SetAdvanceCycle(1);
663 continue;
664 }
665
666 if (scheduleInfo.GetLastUpdateCycle() < scheduleInfo.GetCurrCycle()) {
667 scheduleInfo.SetLastUpdateCycle(scheduleInfo.GetCurrCycle());
668 }
669
670 if (CGOptions::IsDebugSched()) {
671 DumpDebugInfo(scheduleInfo);
672 }
673
674 /* Select a node to schedule */
675 SelectNode(scheduleInfo);
676 }
677
678 DEBUG_ASSERT(scheduleInfo.SizeOfScheduledNodes() == nodes.size(), "CG internal error, Not all nodes scheduled.");
679
680 nodes.clear();
681 (void)nodes.insert(nodes.cbegin(), scheduleInfo.GetScheduledNodes().cbegin(),
682 scheduleInfo.GetScheduledNodes().cend());
683 /* the second to last node is the true last node, because the last is kNodeTypeSeparator node */
684 DEBUG_ASSERT(nodes.size() >= 2, "size of nodes should be greater than or equal 2"); // size not less than 2
685 return (nodes[nodes.size() - 2]->GetSchedCycle()); // size-2 get the second to last node
686 }
687
688 struct RegisterInfoUnit {
689 RegisterInfoUnit() : intRegNum(0), fpRegNum(0), ccRegNum(0) {}
690 uint32 intRegNum = 0;
691 uint32 fpRegNum = 0;
692 uint32 ccRegNum = 0;
693 };
694
695 RegisterInfoUnit GetDepNodeDefType(const DepNode &depNode, const CGFunc &f)
696 {
697 RegisterInfoUnit rIU;
698 for (auto defRegNO : depNode.GetDefRegnos()) {
699 RegType defRegTy = AArch64ScheduleProcessInfo::GetRegisterType(f, defRegNO);
700 if (defRegTy == kRegTyInt) {
701 rIU.intRegNum++;
702 } else if (defRegTy == kRegTyFloat) {
703 rIU.fpRegNum++;
704 } else if (defRegTy == kRegTyCc) {
705 rIU.ccRegNum++;
706 DEBUG_ASSERT(rIU.ccRegNum <= 1, "spill cc reg?");
707 } else {
708 CHECK_FATAL(false, "NIY aarch64 register type");
709 }
710 }
711 /* call node will not increase reg def pressure */
712 if (depNode.GetInsn() != nullptr && depNode.GetInsn()->IsCall()) {
713 rIU.intRegNum = 0;
714 rIU.fpRegNum = 0;
715 }
716 return rIU;
717 }
718
719 AArch64Schedule::CSRResult AArch64Schedule::DoCSR(DepNode &node1, DepNode &node2,
720 AArch64ScheduleProcessInfo &scheduleInfo) const
721 {
722 RegisterInfoUnit defRIU1 = GetDepNodeDefType(node1, cgFunc);
723 RegisterInfoUnit defRIU2 = GetDepNodeDefType(node2, cgFunc);
724 /* do not increase callee save pressure before call */
725 if (static_cast<int>(scheduleInfo.SizeOfCalleeSaveLiveRegister(true)) >= intCalleeSaveThreshold) {
726 if (defRIU1.intRegNum > 0 && defRIU2.intRegNum > 0) {
727 CSRResult csrInfo = ScheduleCrossCall(node1, node2);
728 if ((csrInfo == kNode1 && defRIU1.intRegNum >= scheduleInfo.GetFreeIntRegs(node1)) ||
729 (csrInfo == kNode2 && defRIU2.intRegNum >= scheduleInfo.GetFreeIntRegs(node2))) {
730 return csrInfo;
731 }
732 }
733 }
734 if (static_cast<int>(scheduleInfo.SizeOfCalleeSaveLiveRegister(false)) >= fpCalleeSaveThreshold) {
735 if (defRIU1.fpRegNum > 0 && defRIU2.fpRegNum > 0) {
736 CSRResult csrInfo = ScheduleCrossCall(node1, node2);
737 if ((csrInfo == kNode1 && defRIU1.fpRegNum >= scheduleInfo.GetFreeFpRegs(node1)) ||
738 (csrInfo == kNode2 && defRIU2.fpRegNum >= scheduleInfo.GetFreeFpRegs(node2))) {
739 return csrInfo;
740 }
741 }
742 }
743 auto findFreeRegNode = [&scheduleInfo, &node1, &node2](bool isInt) -> CSRResult {
744 auto freeRegNodes = isInt ? scheduleInfo.GetFreeIntRegNodeSet() : scheduleInfo.GetFreeFpRegNodeSet();
745 if (freeRegNodes.find(&node1) != freeRegNodes.end() && freeRegNodes.find(&node2) == freeRegNodes.end()) {
746 return kNode1;
747 }
748 if (freeRegNodes.find(&node1) == freeRegNodes.end() && freeRegNodes.find(&node2) != freeRegNodes.end()) {
749 return kNode2;
750 }
751 return kDoCSP;
752 };
753 if (static_cast<int>(scheduleInfo.SizeOfIntLiveRegSet()) >= intRegPressureThreshold) {
754 if (findFreeRegNode(true) != kDoCSP) {
755 return findFreeRegNode(true);
756 }
757 }
758 if (static_cast<int>(scheduleInfo.SizeOfFpLiveRegSet()) >= fpRegPressureThreshold) {
759 if (findFreeRegNode(false) != kDoCSP) {
760 return findFreeRegNode(false);
761 }
762 }
763
764 bool canDoCSPFurther = false;
765 if (static_cast<int>(scheduleInfo.SizeOfIntLiveRegSet()) >= intRegPressureThreshold) {
766 if (defRIU1.intRegNum != defRIU2.intRegNum) {
767 return defRIU1.intRegNum < defRIU2.intRegNum ? kNode1 : kNode2;
768 } else {
769 canDoCSPFurther = defRIU1.intRegNum == 0;
770 }
771 }
772 if (static_cast<int>(scheduleInfo.SizeOfFpLiveRegSet()) >= fpRegPressureThreshold) {
773 if (defRIU1.fpRegNum != defRIU2.fpRegNum) {
774 return defRIU1.fpRegNum < defRIU2.fpRegNum ? kNode1 : kNode2;
775 } else {
776 canDoCSPFurther = (defRIU1.fpRegNum == 0 && canDoCSPFurther);
777 }
778 }
779 /* if both nodes are going to increase reg pressure, do not do CSP further */
780 return canDoCSPFurther ? kDoCSP : (node1.GetInsn()->GetId() < node2.GetInsn()->GetId() ? kNode1 : kNode2);
781 }
782
783 AArch64Schedule::CSRResult AArch64Schedule::ScheduleCrossCall(const DepNode &node1, const DepNode &node2) const
784 {
785 uint32 node1ID = node1.GetInsn()->GetId();
786 uint32 node2ID = node2.GetInsn()->GetId();
787 bool order = node1ID < node2ID; /* true -- node1 before node2 false -- node1 after node2 */
788 Insn *beginInsn = order ? node1.GetInsn() : node2.GetInsn();
789 uint32 finalId = order ? node2ID : node1ID;
790 for (Insn *checkInsn = beginInsn; (checkInsn != nullptr && checkInsn->GetId() <= finalId);
791 checkInsn = checkInsn->GetNextMachineInsn()) {
792 if (checkInsn->IsCall()) {
793 return order ? kNode1 : kNode2;
794 }
795 }
796 return kDoCSP;
797 };
798
799 /*
800 * Compare the priorities of node1 and node2 according to some heuristic rules.
801 * Return true if node1's priority is higher.
802 * crp -- consider reg pressure
803 */
804 bool AArch64Schedule::CompareDepNode(DepNode &node1, DepNode &node2, AArch64ScheduleProcessInfo &scheduleInfo) const
805 {
806 /*
807 * strategy CSR -- code schedule for register pressure
808 * if pressure is above the threshold, select the node which can reduce register pressure
809 */
810 if (GetConsiderRegPressure()) {
811 switch (DoCSR(node1, node2, scheduleInfo)) {
812 case kNode1:
813 return true;
814 case kNode2:
815 return false;
816 default:
817 break;
818 }
819 }
820 /* strategy CSP -- code schedule for CPU pipeline */
821 /* less LStart first */
822 if (node1.GetLStart() != node2.GetLStart()) {
823 return node1.GetLStart() < node2.GetLStart();
824 }
825
826 /* max unit kind use */
827 bool use1 = IfUseUnitKind(node1, maxUnitIndex);
828 bool use2 = IfUseUnitKind(node2, maxUnitIndex);
829 if (use1 != use2) {
830 return use1;
831 }
832
833 /* slot0 first */
834 SlotType slotType1 = node1.GetReservation()->GetSlot();
835 SlotType slotType2 = node2.GetReservation()->GetSlot();
836 if (slotType1 == kSlots) {
837 slotType1 = kSlot0;
838 }
839 if (slotType2 == kSlots) {
840 slotType2 = kSlot0;
841 }
842 if (slotType1 != slotType2) {
843 return slotType1 < slotType2;
844 }
845
846 /* more succNodes first */
847 if (node1.GetSuccs().size() != node2.GetSuccs().size()) {
848 return node1.GetSuccs().size() > node2.GetSuccs().size();
849 }
850
851 /* default order */
852 return node1.GetInsn()->GetId() < node2.GetInsn()->GetId();
853 }
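/*
 * Editorial illustration, excluded from the build: the CSP tie-break chain used above, restated on
 * a plain struct -- smaller LStart first, then use of the most contended unit, then slot0 before
 * slot1, then more successors, then program order. Field names are hypothetical; this is not the
 * DepNode interface.
 */
#if 0
#include <cstdint>

struct ToyCandidate {
    uint32_t lStart;
    bool usesMaxUnit;      /* uses the unit kind most demanded by the ready set */
    int slot;              /* 0 = slot0 (kSlots folded into slot0), 1 = slot1 */
    uint32_t succCount;
    uint32_t insnId;       /* original program order */
};

/* Return true when a should be scheduled before b. */
static bool ToyHigherPriority(const ToyCandidate &a, const ToyCandidate &b)
{
    if (a.lStart != b.lStart) {
        return a.lStart < b.lStart;          /* closer to its deadline */
    }
    if (a.usesMaxUnit != b.usesMaxUnit) {
        return a.usesMaxUnit;                /* drain the most contended unit */
    }
    if (a.slot != b.slot) {
        return a.slot < b.slot;              /* prefer slot0 */
    }
    if (a.succCount != b.succCount) {
        return a.succCount > b.succCount;    /* unblock more successors */
    }
    return a.insnId < b.insnId;              /* stable fallback: program order */
}
#endif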
854
855 /*
856 * Calculate how many nodes in availableReadyList use each unit kind, and save the index of the most used unit kind in maxUnitIndex
857 */
858 void AArch64Schedule::CalculateMaxUnitKindCount(ScheduleProcessInfo &scheduleInfo) const
859 {
860 uint32 unitKindCount[kUnitKindLast] = {0};
861 for (auto node : scheduleInfo.GetAvailableReadyList()) {
862 CountUnitKind(*node, unitKindCount, kUnitKindLast);
863 }
864
865 uint32 maxCount = 0;
866 maxUnitIndex = 0;
867 for (size_t i = 1; i < kUnitKindLast; ++i) {
868 if (maxCount < unitKindCount[i]) {
869 maxCount = unitKindCount[i];
870 maxUnitIndex = i;
871 }
872 }
873 }
874
875 /*
876 * Update the release reg node set
877 * When a node in this set is scheduled, register pressure can be reduced
878 */
879 void AArch64Schedule::UpdateReleaseRegInfo(AArch64ScheduleProcessInfo &scheduleInfo)
880 {
881 auto &availableReadyList = scheduleInfo.GetAvailableReadyList();
882 scheduleInfo.ClearALLFreeRegNodeSet();
883 /* Traverse availableReadyList and add those that can reduce register pressure to the release reg node set */
884 for (auto node : availableReadyList) {
885 std::set<regno_t> freeRegNO = CanFreeRegister(*node);
886 if (!freeRegNO.empty()) {
887 scheduleInfo.VaryFreeRegSet(cgFunc, freeRegNO, *node);
888 }
889 }
890 }
891
892 /*
893 * Return the registers which an instruction can release after being scheduled
894 */
895 std::set<regno_t> AArch64Schedule::CanFreeRegister(const DepNode &node) const
896 {
897 std::set<regno_t> freeRegSet;
898 for (auto reg : node.GetUseRegnos()) {
899 if (RegPressureSchedule::IsLastUse(node, reg)) {
900 freeRegSet.emplace(reg);
901 }
902 }
903 return freeRegSet;
904 }
905
906 /*
907 * After an instruction is scheduled, update live reg set
908 */
909 void AArch64Schedule::UpdateLiveRegSet(AArch64ScheduleProcessInfo &scheduleInfo, const DepNode &node)
910 {
911 /* dealing with def reg, add def reg into the live reg set */
912 size_t i = 1;
913 for (auto &defReg : node.GetDefRegnos()) {
914 if (scheduleInfo.FindIntLiveReg(defReg) == 0 && scheduleInfo.FindFpLiveReg(defReg) == 0) {
915 scheduleInfo.VaryLiveRegSet(cgFunc, defReg, true);
916 }
917 /* delete dead def reg from live reg set because its live range is only 1 cycle */
918 if (node.GetRegDefs(i) == nullptr && liveOutRegNo.find(defReg) == liveOutRegNo.end()) {
919 scheduleInfo.VaryLiveRegSet(cgFunc, defReg, false);
920 }
921 ++i;
922 }
923 /* dealing with use reg, delete use reg from live reg set if this instruction is last use of it */
924 for (auto &useReg : node.GetUseRegnos()) {
925 if (RegPressureSchedule::IsLastUse(node, useReg)) {
926 if ((scheduleInfo.FindIntLiveReg(useReg) != 0 || scheduleInfo.FindFpLiveReg(useReg) != 0) &&
927 liveOutRegNo.find(useReg) == liveOutRegNo.end()) {
928 scheduleInfo.VaryLiveRegSet(cgFunc, useReg, false);
929 }
930 }
931 }
932 }
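/*
 * Editorial illustration, excluded from the build: the live-set bookkeeping above on a plain
 * std::set. A scheduled instruction adds its defs, immediately drops defs that are never read and
 * are not live-out, and drops sources at their last use unless they are live-out of the block.
 * All names here are hypothetical.
 */
#if 0
#include <set>

using ToyReg = unsigned;

static void ToyUpdateLiveSet(std::set<ToyReg> &live, const std::set<ToyReg> &liveOut,
                             const std::set<ToyReg> &defs, const std::set<ToyReg> &deadDefs,
                             const std::set<ToyReg> &lastUses)
{
    for (ToyReg r : defs) {
        live.insert(r);                /* new value becomes live */
    }
    for (ToyReg r : deadDefs) {
        if (liveOut.count(r) == 0) {
            live.erase(r);             /* defined but never read again */
        }
    }
    for (ToyReg r : lastUses) {
        if (liveOut.count(r) == 0) {
            live.erase(r);             /* source dies at this instruction */
        }
    }
}
#endif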
933
934 /*
935 * Initialize the live reg set based on the live in reg information
936 */
937 void AArch64Schedule::InitLiveRegSet(AArch64ScheduleProcessInfo &scheduleInfo)
938 {
939 if (GetConsiderRegPressure()) {
940 for (auto reg : liveInRegNo) {
941 scheduleInfo.VaryLiveRegSet(cgFunc, reg, true);
942 }
943 }
944 }
945
946 /*
947 * A simulated schedule:
948 * schedule instructions in their original order to calculate the original execution cycle count.
949 */
950 uint32 AArch64Schedule::SimulateOnly()
951 {
952 uint32 currCycle = 0;
953 uint32 advanceCycle = 0;
954 Init();
955
956 for (uint32 i = 0; i < nodes.size();) {
957 while (advanceCycle > 0) {
958 ++currCycle;
959 mad->AdvanceOneCycleForAll();
960 --advanceCycle;
961 }
962
963 DepNode *targetNode = nodes[i];
964 if ((currCycle >= targetNode->GetEStart()) && targetNode->IsResourceIdle()) {
965 targetNode->SetSimulateCycle(currCycle);
966 targetNode->OccupyRequiredUnits();
967
968 /* Update estart. */
969 for (auto succLink : targetNode->GetSuccs()) {
970 DepNode &succNode = succLink->GetTo();
971 uint32 eStart = currCycle + succLink->GetLatency();
972 if (succNode.GetEStart() < eStart) {
973 succNode.SetEStart(eStart);
974 }
975 }
976
977 if (CGOptions::IsDebugSched()) {
978 LogInfo::MapleLogger() << "[Simulate] TargetNode : ";
979 targetNode->GetInsn()->Dump();
980 LogInfo::MapleLogger() << "\n";
981 }
982
983 switch (targetNode->GetInsn()->GetLatencyType()) {
984 case kLtClinit:
985 advanceCycle = kClinitAdvanceCycle;
986 break;
987 case kLtAdrpLdr:
988 advanceCycle = kAdrpLdrAdvanceCycle;
989 break;
990 case kLtClinitTail:
991 advanceCycle = kClinitTailAdvanceCycle;
992 break;
993 default:
994 break;
995 }
996
997 ++i;
998 } else {
999 advanceCycle = 1;
1000 }
1001 }
1002 /* the second to last node is the true last node, because the last is a kNodeTypeSeparator node */
1003 DEBUG_ASSERT(nodes.size() >= kSecondToLastNode, "size of nodes should be greater than or equal 2");
1004 return (nodes[nodes.size() - kSecondToLastNode]->GetSimulateCycle());
1005 }
1006
1007 /* Restore dependence graph to normal CGIR. */
1008 void AArch64Schedule::FinalizeScheduling(BB &bb, const DepAnalysis &depAnalysis)
1009 {
1010 bb.ClearInsns();
1011
1012 const Insn *prevLocInsn = (bb.GetPrev() != nullptr ? bb.GetPrev()->GetLastLoc() : nullptr);
1013 for (auto node : nodes) {
1014 /* Append comments first. */
1015 for (auto comment : node->GetComments()) {
1016 if (comment->GetPrev() != nullptr && comment->GetPrev()->IsDbgInsn()) {
1017 bb.AppendInsn(*comment->GetPrev());
1018 }
1019 bb.AppendInsn(*comment);
1020 }
1021 /* Append cfi instructions. */
1022 for (auto cfi : node->GetCfiInsns()) {
1023 bb.AppendInsn(*cfi);
1024 }
1025 /* Append insn. */
1026 if (!node->GetClinitInsns().empty()) {
1027 for (auto clinit : node->GetClinitInsns()) {
1028 bb.AppendInsn(*clinit);
1029 }
1030 } else if (node->GetType() == kNodeTypeNormal) {
1031 if (node->GetInsn()->GetPrev() != nullptr && node->GetInsn()->GetPrev()->IsDbgInsn()) {
1032 bb.AppendInsn(*node->GetInsn()->GetPrev());
1033 }
1034 bb.AppendInsn(*node->GetInsn());
1035 }
1036 }
1037 bb.SetLastLoc(prevLocInsn);
1038
1039 for (auto lastComment : depAnalysis.GetLastComments()) {
1040 bb.AppendInsn(*lastComment);
1041 }
1042 }
1043
1044 /* For every node of nodes, update its bruteForceSchedCycle. */
1045 void AArch64Schedule::UpdateBruteForceSchedCycle()
1046 {
1047 for (auto node : nodes) {
1048 node->SetBruteForceSchedCycle(node->GetSchedCycle());
1049 }
1050 }
1051
1052 /* Recursively schedule all of the possible nodes. */
1053 void AArch64Schedule::IterateBruteForce(DepNode &targetNode, MapleVector<DepNode *> &readyList, uint32 currCycle,
1054 MapleVector<DepNode *> &scheduledNodes, uint32 &maxCycleCount,
1055 MapleVector<DepNode *> &optimizedScheduledNodes)
1056 {
1057 /* Save states. */
1058 constexpr int32 unitSize = 31;
1059 DEBUG_ASSERT(unitSize == mad->GetAllUnitsSize(), "CG internal error.");
1060 constexpr int32 bitSetLength = 32;
1061 std::vector<std::bitset<bitSetLength>> occupyTable;
1062 occupyTable.resize(unitSize, 0);
1063 mad->SaveStates(occupyTable, unitSize);
1064
1065 /* Schedule targetNode first. */
1066 targetNode.SetState(kScheduled);
1067 targetNode.SetSchedCycle(currCycle);
1068 scheduledNodes.emplace_back(&targetNode);
1069
1070 MapleVector<DepNode *> tempList = readyList;
1071 EraseNodeFromNodeList(targetNode, tempList);
1072 targetNode.OccupyRequiredUnits();
1073
1074 /* Update readyList. */
1075 UpdateReadyList(targetNode, tempList, true);
1076
1077 if (targetNode.GetType() == kNodeTypeSeparator) {
1078 /* If target node is separator node, update lastSeparatorIndex. */
1079 lastSeparatorIndex += kMaxDependenceNum;
1080 }
1081
1082 if (tempList.empty()) {
1083 DEBUG_ASSERT(scheduledNodes.size() == nodes.size(), "CG internal error, Not all nodes scheduled.");
1084 if (currCycle < maxCycleCount) {
1085 maxCycleCount = currCycle;
1086 UpdateBruteForceSchedCycle();
1087 optimizedScheduledNodes = scheduledNodes;
1088 }
1089 } else {
1090 uint32 advanceCycle = 0;
1091 switch (targetNode.GetInsn()->GetLatencyType()) {
1092 case kLtClinit:
1093 advanceCycle = kClinitAdvanceCycle;
1094 break;
1095 case kLtAdrpLdr:
1096 advanceCycle = kAdrpLdrAdvanceCycle;
1097 break;
1098 case kLtClinitTail:
1099 advanceCycle = kClinitTailAdvanceCycle;
1100 break;
1101 default:
1102 break;
1103 }
1104
1105 do {
1106 std::vector<DepNode *> availableReadyList;
1107 std::vector<DepNode *> tempAvailableList;
1108 while (advanceCycle > 0) {
1109 ++currCycle;
1110 mad->AdvanceOneCycleForAll();
1111 --advanceCycle;
1112 }
1113 /* Check EStart. */
1114 for (auto node : tempList) {
1115 if (node->GetEStart() <= currCycle) {
1116 tempAvailableList.emplace_back(node);
1117 }
1118 }
1119
1120 if (tempAvailableList.empty()) {
1121 /* Advance cycle. */
1122 advanceCycle = 1;
1123 continue;
1124 }
1125
1126 /* Check if schedulable */
1127 for (auto node : tempAvailableList) {
1128 if (node->IsResourceIdle()) {
1129 availableReadyList.emplace_back(node);
1130 }
1131 }
1132
1133 if (availableReadyList.empty()) {
1134 /* Advance cycle. */
1135 advanceCycle = 1;
1136 continue;
1137 }
1138
1139 for (auto node : availableReadyList) {
1140 IterateBruteForce(*node, tempList, currCycle, scheduledNodes, maxCycleCount, optimizedScheduledNodes);
1141 }
1142
1143 break;
1144 } while (true);
1145 }
1146
1147 /*
1148 * Recover states.
1149 * Restore targetNode first.
1150 */
1151 targetNode.SetState(kReady);
1152 targetNode.SetSchedCycle(0);
1153 scheduledNodes.pop_back();
1154 mad->RestoreStates(occupyTable, unitSize);
1155
1156 /* Update readyList. */
1157 for (auto succLink : targetNode.GetSuccs()) {
1158 DepNode &succNode = succLink->GetTo();
1159 succNode.IncreaseValidPredsSize();
1160 succNode.SetState(kNormal);
1161 }
1162
1163 if (targetNode.GetType() == kNodeTypeSeparator) {
1164 /* If target node is separator node, update lastSeparatorIndex. */
1165 lastSeparatorIndex -= kMaxDependenceNum;
1166 }
1167 }
1168
1169 /*
1170 * Brute force schedule:
1171 * Find all possible schedule lists of the current bb and calculate every list's execution cycles,
1172 * then return the optimal schedule list and its cycle count.
1173 */
1174 uint32 AArch64Schedule::DoBruteForceSchedule()
1175 {
1176 MapleVector<DepNode *> scheduledNodes(alloc.Adapter());
1177 MapleVector<DepNode *> optimizedScheduledNodes(alloc.Adapter());
1178
1179 uint32 currCycle = 0;
1180 uint32 maxCycleCount = 0xFFFFFFFF;
1181 Init();
1182
1183 /* Schedule First separator. */
1184 DepNode *targetNode = readyList.front();
1185 targetNode->SetState(kScheduled);
1186 targetNode->SetSchedCycle(currCycle);
1187 scheduledNodes.emplace_back(targetNode);
1188 readyList.clear();
1189
1190 /* Update readyList. */
1191 UpdateReadyList(*targetNode, readyList, false);
1192
1193 DEBUG_ASSERT(targetNode->GetType() == kNodeTypeSeparator, "The first node should be separator node.");
1194 DEBUG_ASSERT(!readyList.empty(), "readyList should not be empty.");
1195
1196 for (auto targetNodeTemp : readyList) {
1197 IterateBruteForce(*targetNodeTemp, readyList, currCycle, scheduledNodes, maxCycleCount,
1198 optimizedScheduledNodes);
1199 }
1200
1201 nodes = optimizedScheduledNodes;
1202 return maxCycleCount;
1203 }
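/*
 * Editorial illustration, excluded from the build: the brute-force search above is an exhaustive
 * walk over all topological orders of the region, remembering the order with the smallest finish
 * cycle. This sketch enumerates the orders of a tiny DAG recursively (keeping only the first
 * complete order, where the real code compares cycle counts); it is exponential, which is why the
 * caller below caps the node count. Names are hypothetical.
 */
#if 0
#include <cstddef>
#include <cstdint>
#include <vector>

struct ToyBfNode {
    std::vector<uint32_t> succs;
    uint32_t pendingPreds = 0;
};

static void ToyEnumerate(std::vector<ToyBfNode> &dag, const std::vector<uint32_t> &ready,
                         std::vector<uint32_t> &order, std::vector<uint32_t> &bestOrder)
{
    if (ready.empty()) {
        if (order.size() == dag.size() && bestOrder.empty()) {
            bestOrder = order;   /* the real code compares finish cycles and keeps the minimum */
        }
        return;
    }
    for (size_t i = 0; i < ready.size(); ++i) {
        uint32_t cur = ready[i];
        std::vector<uint32_t> nextReady = ready;
        nextReady.erase(nextReady.begin() + static_cast<std::ptrdiff_t>(i));
        for (uint32_t s : dag[cur].succs) {
            if (--dag[s].pendingPreds == 0) {
                nextReady.push_back(s);
            }
        }
        order.push_back(cur);
        ToyEnumerate(dag, nextReady, order, bestOrder);
        order.pop_back();
        for (uint32_t s : dag[cur].succs) {
            ++dag[s].pendingPreds;            /* restore state, as IterateBruteForce does */
        }
    }
}

static std::vector<uint32_t> ToyBruteForceSchedule(std::vector<ToyBfNode> dag)
{
    for (const ToyBfNode &n : dag) {
        for (uint32_t s : n.succs) {
            ++dag[s].pendingPreds;
        }
    }
    std::vector<uint32_t> ready;
    for (uint32_t i = 0; i < dag.size(); ++i) {
        if (dag[i].pendingPreds == 0) {
            ready.push_back(i);
        }
    }
    std::vector<uint32_t> order;
    std::vector<uint32_t> bestOrder;
    ToyEnumerate(dag, ready, order, bestOrder);
    return bestOrder;
}
#endif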
1204
1205 /*
1206 * Update ready list after the targetNode has been scheduled.
1207 * For every successor of targetNode, if all of its predecessors have been scheduled,
1208 * add it to the ready list and update its information (like state, estart).
1209 */
1210 void AArch64Schedule::UpdateReadyList(DepNode &targetNode, MapleVector<DepNode *> &readyList, bool updateEStart)
1211 {
1212 for (auto succLink : targetNode.GetSuccs()) {
1213 DepNode &succNode = succLink->GetTo();
1214 succNode.DecreaseValidPredsSize();
1215 if (succNode.GetValidPredsSize() == 0) {
1216 readyList.emplace_back(&succNode);
1217 succNode.SetState(kReady);
1218
1219 /* Set eStart. */
1220 if (updateEStart) {
1221 uint32 maxEstart = 0;
1222 for (auto predLink : succNode.GetPreds()) {
1223 DepNode &predNode = predLink->GetFrom();
1224 uint32 eStart = predNode.GetSchedCycle() + predLink->GetLatency();
1225 maxEstart = (maxEstart < eStart ? eStart : maxEstart);
1226 }
1227 succNode.SetEStart(maxEstart);
1228 }
1229 }
1230 }
1231 }
1232
1233 /* For every node of nodes, dump its dependence information. */
1234 void AArch64Schedule::DumpDepGraph(const MapleVector<DepNode *> &nodes) const
1235 {
1236 for (auto node : nodes) {
1237 depAnalysis->DumpDepNode(*node);
1238 LogInfo::MapleLogger() << "---------- preds ----------"
1239 << "\n";
1240 for (auto pred : node->GetPreds()) {
1241 depAnalysis->DumpDepLink(*pred, &(pred->GetFrom()));
1242 }
1243 LogInfo::MapleLogger() << "---------- succs ----------"
1244 << "\n";
1245 for (auto succ : node->GetSuccs()) {
1246 depAnalysis->DumpDepLink(*succ, &(succ->GetTo()));
1247 }
1248 LogInfo::MapleLogger() << "---------------------------"
1249 << "\n";
1250 }
1251 }
1252
1253 /* For every node of nodes, dump its schedule time according to the simulate type, together with the instruction. */
1254 void AArch64Schedule::DumpScheduleResult(const MapleVector<DepNode *> &nodes, SimulateType type) const
1255 {
1256 for (auto node : nodes) {
1257 LogInfo::MapleLogger() << "cycle[ ";
1258 switch (type) {
1259 case kListSchedule:
1260 LogInfo::MapleLogger() << node->GetSchedCycle();
1261 break;
1262 case kBruteForce:
1263 LogInfo::MapleLogger() << node->GetBruteForceSchedCycle();
1264 break;
1265 case kSimulateOnly:
1266 LogInfo::MapleLogger() << node->GetSimulateCycle();
1267 break;
1268 }
1269 LogInfo::MapleLogger() << " ] ";
1270 node->GetInsn()->Dump();
1271 LogInfo::MapleLogger() << "\n";
1272 }
1273 }
1274
1275 /* Print bb's dependence dot graph information to a file. */
1276 void AArch64Schedule::GenerateDot(const BB &bb, const MapleVector<DepNode *> &nodes) const
1277 {
1278 std::streambuf *coutBuf = std::cout.rdbuf(); /* keep original cout buffer */
1279 std::ofstream dgFile;
1280 std::streambuf *buf = dgFile.rdbuf();
1281 std::cout.rdbuf(buf);
1282
1283 /* construct the file name */
1284 std::string fileName;
1285 fileName.append(phaseName);
1286 fileName.append("_");
1287 fileName.append(cgFunc.GetName());
1288 fileName.append("_BB");
1289 auto str = std::to_string(bb.GetId());
1290 fileName.append(str);
1291 fileName.append("_dep_graph.dot");
1292
1293 dgFile.open(FileUtils::GetRealPath(fileName), std::ios::trunc);
1294 if (!dgFile.is_open()) {
1295 LogInfo::MapleLogger(kLlWarn) << "fileName:" << fileName << " open failure.\n";
1296 return;
1297 }
1298 dgFile << "digraph {\n";
1299 for (auto node : nodes) {
1300 for (auto succ : node->GetSuccs()) {
1301 dgFile << "insn" << node->GetInsn() << " -> "
1302 << "insn" << succ->GetTo().GetInsn();
1303 dgFile << " [";
1304 if (succ->GetDepType() == kDependenceTypeTrue) {
1305 dgFile << "color=red,";
1306 }
1307 dgFile << "label= \"" << succ->GetLatency() << "\"";
1308 dgFile << "];\n";
1309 }
1310 }
1311
1312 for (auto node : nodes) {
1313 MOperator mOp = node->GetInsn()->GetMachineOpcode();
1314 const InsnDesc *md = &AArch64CG::kMd[mOp];
1315 dgFile << "insn" << node->GetInsn() << "[";
1316 dgFile << "shape=box,label= \" " << node->GetInsn()->GetId() << ":\n";
1317 dgFile << "{ ";
1318 dgFile << md->name << "\n";
1319 dgFile << "}\"];\n";
1320 }
1321 dgFile << "}\n";
1322 dgFile.flush();
1323 dgFile.close();
1324 std::cout.rdbuf(coutBuf);
1325 }
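/*
 * Editorial illustration: for a block with two dependent insns, the emitted file looks roughly
 * like the text below (the "insn..." node names are built from insn pointer values, so the exact
 * identifiers and opcodes here are hypothetical); true dependences are colored red and every edge
 * is labelled with its latency:
 *
 *   digraph {
 *   insn0x55d3a0 -> insn0x55d470 [color=red,label= "3"];
 *   insn0x55d3a0[shape=box,label= " 1:
 *   { add
 *   }"];
 *   insn0x55d470[shape=box,label= " 2:
 *   { str
 *   }"];
 *   }
 */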
1326
1327 RegType AArch64ScheduleProcessInfo::GetRegisterType(const CGFunc &f, regno_t regNO)
1328 {
1329 if (AArch64isa::IsPhysicalRegister(regNO)) {
1330 if (AArch64isa::IsGPRegister(static_cast<AArch64reg>(regNO))) {
1331 return kRegTyInt;
1332 } else if (AArch64isa::IsFPSIMDRegister(static_cast<AArch64reg>(regNO))) {
1333 return kRegTyFloat;
1334 } else {
1335 CHECK_FATAL(false, "unknown physical reg");
1336 }
1337 } else {
1338 RegOperand *curRegOpnd = f.GetVirtualRegisterOperand(regNO);
1339 DEBUG_ASSERT(curRegOpnd != nullptr, "register which is not physical and virtual");
1340 return curRegOpnd->GetRegisterType();
1341 }
1342 }
1343
1344 void AArch64ScheduleProcessInfo::VaryLiveRegSet(const CGFunc &f, regno_t regNO, bool isInc)
1345 {
1346 RegType registerTy = GetRegisterType(f, regNO);
1347 if (registerTy == kRegTyInt || registerTy == kRegTyVary) {
1348 isInc ? IncIntLiveRegSet(regNO) : DecIntLiveRegSet(regNO);
1349 } else if (registerTy == kRegTyFloat) {
1350 isInc ? IncFpLiveRegSet(regNO) : DecFpLiveRegSet(regNO);
1351 }
1352 /* consider other type register */
1353 }
1354
1355 void AArch64ScheduleProcessInfo::VaryFreeRegSet(const CGFunc &f, std::set<regno_t> regNOs, DepNode &node)
1356 {
1357 for (auto regNO : regNOs) {
1358 RegType registerTy = GetRegisterType(f, regNO);
1359 if (registerTy == kRegTyInt || registerTy == kRegTyVary /* memory base register must be int */) {
1360 IncFreeIntRegNode(node);
1361 } else if (registerTy == kRegTyFloat) {
1362 IncFreeFpRegNode(node);
1363 } else if (registerTy == kRegTyCc) {
1364 /* do not count CC reg */
1365 return;
1366 } else {
1367 /* consider other type register */
1368 CHECK_FATAL(false, "do not support this type of register");
1369 }
1370 }
1371 }
1372
1373 /* Do brute force scheduling and dump scheduling information */
1374 void AArch64Schedule::BruteForceScheduling(const BB &bb)
1375 {
1376 LogInfo::MapleLogger() << "\n\n$$ Function: " << cgFunc.GetName();
1377 LogInfo::MapleLogger() << "\n BB id = " << bb.GetId() << "; nodes.size = " << nodes.size() << "\n";
1378
1379 constexpr uint32 maxBruteForceNum = 50;
1380 if (nodes.size() < maxBruteForceNum) {
1381 GenerateDot(bb, nodes);
1382 uint32 maxBruteForceCycle = DoBruteForceSchedule();
1383 MapleVector<DepNode *> bruteNodes = nodes;
1384 uint32 maxSchedCycle = DoSchedule();
1385 if (maxBruteForceCycle < maxSchedCycle) {
1386 LogInfo::MapleLogger() << "maxBruteForceCycle = " << maxBruteForceCycle << "; maxSchedCycle = ";
1387 LogInfo::MapleLogger() << maxSchedCycle << "\n";
1388 LogInfo::MapleLogger() << "\n ## Dump dependence graph ## "
1389 << "\n";
1390 DumpDepGraph(nodes);
1391 LogInfo::MapleLogger() << "\n ** Dump bruteForce scheduling result."
1392 << "\n";
1393 DumpScheduleResult(bruteNodes, kBruteForce);
1394 LogInfo::MapleLogger() << "\n ^^ Dump list scheduling result."
1395 << "\n";
1396 DumpScheduleResult(nodes, kListSchedule);
1397 }
1398 } else {
1399 LogInfo::MapleLogger() << "Skip BruteForce scheduling."
1400 << "\n";
1401 DoSchedule();
1402 }
1403 }
1404
1405 /* Do simulate scheduling and dump scheduling information */
1406 void AArch64Schedule::SimulateScheduling(const BB &bb)
1407 {
1408 uint32 originCycle = SimulateOnly();
1409 MapleVector<DepNode *> oldNodes = nodes;
1410 uint32 schedCycle = DoSchedule();
1411 if (originCycle < schedCycle) {
1412 LogInfo::MapleLogger() << "Worse cycle [ " << (schedCycle - originCycle) << " ]; ";
1413 LogInfo::MapleLogger() << "originCycle = " << originCycle << "; schedCycle = ";
1414 LogInfo::MapleLogger() << schedCycle << "; nodes.size = " << nodes.size();
1415 LogInfo::MapleLogger() << "; $$ Function: " << cgFunc.GetName();
1416 LogInfo::MapleLogger() << "; BB id = " << bb.GetId() << "\n";
1417 LogInfo::MapleLogger() << "\n ** Dump original result."
1418 << "\n";
1419 DumpScheduleResult(oldNodes, kSimulateOnly);
1420 LogInfo::MapleLogger() << "\n ^^ Dump list scheduling result."
1421 << "\n";
1422 DumpScheduleResult(nodes, kListSchedule);
1423 } else if (originCycle > schedCycle) {
1424 LogInfo::MapleLogger() << "Advance cycle [ " << (originCycle - schedCycle) << " ]; ";
1425 LogInfo::MapleLogger() << "originCycle = " << originCycle << "; schedCycle = ";
1426 LogInfo::MapleLogger() << schedCycle << "; nodes.size = " << nodes.size();
1427 LogInfo::MapleLogger() << "; $$ Function: " << cgFunc.GetName();
1428 LogInfo::MapleLogger() << "; BB id = " << bb.GetId() << "\n";
1429 } else {
1430 LogInfo::MapleLogger() << "Equal cycle [ 0 ]; originCycle = " << originCycle;
1431 LogInfo::MapleLogger() << " ], ignore. nodes.size = " << nodes.size() << "\n";
1432 }
1433 }
1434
1435 /*
1436 * A local list scheduling.
1437 * Schedule insns in basic blocks.
1438 */
1439 void AArch64Schedule::ListScheduling(bool beforeRA)
1440 {
1441 InitIDAndLoc();
1442
1443 mad = Globals::GetInstance()->GetMAD();
1444 if (beforeRA) {
1445 RegPressure::SetMaxRegClassNum(kRegisterLast);
1446 }
1447 depAnalysis = memPool.New<AArch64DepAnalysis>(cgFunc, memPool, *mad, beforeRA);
1448 FOR_ALL_BB(bb, &cgFunc)
1449 {
1450 if (bb->IsUnreachable()) {
1451 continue;
1452 }
1453 if (bb->IsAtomicBuiltInBB()) {
1454 continue;
1455 }
1456 depAnalysis->Run(*bb, nodes);
1457
1458 if (LIST_SCHED_DUMP_REF) {
1459 GenerateDot(*bb, nodes);
1460 DumpDepGraph(nodes);
1461 }
1462 if (beforeRA) {
1463 liveInRegNo = bb->GetLiveInRegNO();
1464 liveOutRegNo = bb->GetLiveOutRegNO();
1465 if (bb->GetKind() != BB::kBBReturn) {
1466 SetConsiderRegPressure();
1467 DoSchedule();
1468 } else {
1469 RegPressureScheduling(*bb, nodes);
1470 }
1471 } else {
1472 ClinitPairOpt();
1473 MemoryAccessPairOpt();
1474 if (CGOptions::IsDruteForceSched()) {
1475 BruteForceScheduling(*bb);
1476 } else if (CGOptions::IsSimulateSched()) {
1477 SimulateScheduling(*bb);
1478 } else {
1479 DoSchedule();
1480 }
1481 }
1482
1483 FinalizeScheduling(*bb, *depAnalysis);
1484 }
1485 }
1486 } /* namespace maplebe */
1487