/*
 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "scheduler.h"

#include "optimizer/analysis/alias_analysis.h"
#include "optimizer/ir/analysis.h"
#include "optimizer/ir/basicblock.h"
#include "compiler_logger.h"

namespace ark::compiler {
/* Instruction scheduling.
 * Current decisions/limitations
 *
 * 1. The scheduler pass is placed immediately before the register allocator.
 * 2. It rearranges instructions only within a basic block, not across block boundaries.
 * 3. No liveness analysis; dependencies are calculated only from barrier/users/alias information.
 * 4. No CPU pipeline/resource modeling; only dependency costs (latencies) are used.
 * 5. Forward list scheduling algorithm with standard critical-path-based priority.
 */
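// Pass entry point. For every non-empty, non-start block the scheduler builds the dependency graph
// (BuildAllDeps/ProcessInst), list-schedules the instructions between barriers (ScheduleInstsBetweenBarriers),
// rewrites the block in the new order (FinalizeBB) and, if the block has changed, fixes SaveState bridges.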
bool Scheduler::RunImpl()
{
    COMPILER_LOG(DEBUG, SCHEDULER) << "Run " << GetPassName();
    SaveStateBridgesBuilder ssb;
    bool result = false;
    for (auto bb : GetGraph()->GetBlocksRPO()) {
        if (!bb->IsEmpty() && !bb->IsStartBlock()) {
            if (ScheduleBasicBlock(bb)) {
                ssb.FixSaveStatesInBB(bb);
                result = true;
            }
        }
    }
    COMPILER_LOG(DEBUG, SCHEDULER) << GetPassName() << " completed";
    return result;
}

// Dependency helper function
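// Records that 'to' depends on 'from', i.e. 'to' may start no earlier than 'from' plus 'latency':
//   - ocycle_[from] accumulates an estimate of the completion cycle in the original (unscheduled) order,
//   - *prio accumulates the critical-path length of 'from' (latency plus the priority of 'to'),
//   - deps_[from][to] keeps the largest latency seen for this edge, and numDeps_[to] counts how many
//     distinct predecessors 'to' is still waiting for.
// Edges that would cross the nearest barrier only contribute to the priority and are not stored in deps_.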
void Scheduler::AddDep(uint32_t *prio, Inst *from, Inst *to, uint32_t latency, Inst *barrier)
{
    if (from == to) {
        return;
    }
    COMPILER_LOG(DEBUG, SCHEDULER) << "Found dependency " << from->GetId() << " -> " << to->GetId() << " latency "
                                   << latency;
    // Estimate old cycle (without scheduling)
    ocycle_[from] = std::max(ocycle_[from], latency + ocycle_[to]);

    // Update instruction priority - "how high the instruction is in the dependency tree"
    *prio = std::max(*prio, latency + prio_[to]);

    // Do not add cross-barrier dependencies into deps_
    if (barrier == nullptr || old_[to] > old_[barrier]) {
        if (deps_.at(from).count(to) == 1) {
            uint32_t oldLatency = deps_.at(from).at(to);
            if (oldLatency >= latency) {
                return;
            }
        } else {
            numDeps_[to]++;
        }
        deps_.at(from)[to] = latency;
    }
}

// Calculate priority and dependencies
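// The block is walked in reverse, so every dependency edge points from an earlier instruction to a later one.
// Blocks with too many instructions between barriers, or too many memory/special instructions overall, are
// skipped: dependency construction is quadratic in these counts.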
bool Scheduler::BuildAllDeps(BasicBlock *bb)
{
    auto markerHolder = MarkerHolder(GetGraph());
    mrk_ = markerHolder.GetMarker();

    oprev_ = 0;
    numBarriers_ = 0;
    maxPrio_ = 0;

    static constexpr uint32_t TOO_LONG_BB = 64;
    uint32_t numInst = 0;
    uint32_t numBetween = 0;
    uint32_t numSpecial = 0;
    Inst *lastBarrier = nullptr;
    for (auto inst : bb->InstsSafeReverse()) {
        ProcessInst(inst, &numInst, &numBetween, &numSpecial, &lastBarrier);

        if (numSpecial > TOO_LONG_BB || numBetween > TOO_LONG_BB) {
            COMPILER_LOG(DEBUG, SCHEDULER) << "Block " << bb->GetId() << " seems too big for scheduling, skipping";
            Cleanup();
            return false;
        }
    }
    return true;
}

// One instruction deps
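// Registers 'inst' in the bookkeeping tables and adds its dependencies: an edge to the nearest following
// barrier, edges from a barrier to every instruction between it and the next barrier, edges to every
// already-visited user, and (via ProcessMemory/ProcessSpecial) the ordering required for memory accesses,
// SaveStates, runtime calls and potentially throwing instructions.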
void Scheduler::ProcessInst(Inst *inst, uint32_t *numInst, uint32_t *numBetween, uint32_t *numSpecial,
                            Inst **lastBarrier)
{
    uint32_t prio = 0;
    uint32_t instLatency = inst->Latency();
    bool barrier = inst->IsBarrier();

    (*numBetween)++;
    old_.insert({inst, (*numInst)++});
    ocycle_.insert({inst, ++oprev_});
    numDeps_.insert({inst, 0U});
    deps_.emplace(inst, GetGraph()->GetLocalAllocator()->Adapter());

    // Dependency to the barrier
    if (*lastBarrier != nullptr) {
        AddDep(&prio, inst, *lastBarrier, 1U, *lastBarrier);
    }

    // Dependency from barrier
    if (barrier) {
        Inst *oldLastBarrier = *lastBarrier;
        *lastBarrier = inst;
        numBarriers_++;
        *numBetween = 0;
        for (auto user = inst->GetNext(); user != oldLastBarrier; user = user->GetNext()) {
            AddDep(&prio, inst, user, 1U, *lastBarrier);
        }
    }

    // Users
    for (auto &userItem : inst->GetUsers()) {
        auto user = userItem.GetInst();
        if (user->IsMarked(mrk_)) {
            AddDep(&prio, inst, user, instLatency, *lastBarrier);
        }
    }

    if (inst->IsMovableObject()) {
        // Take every SaveState below this reference instruction that has RuntimeCall users and create a
        // dependency between this instruction and that SaveState.
        // See also GraphChecker::CheckSaveStatesWithRuntimeCallUsers.
        for (auto &ss : ssWithRuntimeCall_) {
            AddDep(&prio, inst, ss, 1U, *lastBarrier);
        }
    }

    if (inst->IsMemory() || inst->IsRefSpecial()) {
        ProcessMemory(inst, &prio, *lastBarrier);
        (*numSpecial)++;
    }

    if (inst->CanThrow() || inst->IsRuntimeCall() || inst->IsSaveState()) {
        ProcessSpecial(inst, &prio, *lastBarrier);
        (*numSpecial)++;
    }

    inst->SetMarker(mrk_);
    prio_.insert({inst, prio});
    maxPrio_ = std::max(maxPrio_, prio);
    oprev_ = ocycle_[inst];
}

// Memory
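// Ordering rules for memory accesses (loads_/stores_/special_ hold instructions located later in the block):
// any access must stay ahead of a later store it may alias; a store must additionally stay ahead of later
// aliasing loads and of all later special instructions. RefSpecial instructions are only recorded as loads;
// no dependencies are added for them here.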
void Scheduler::ProcessMemory(Inst *inst, uint32_t *prio, Inst *lastBarrier)
{
    if (inst->IsRefSpecial()) {
        loads_.push_back(inst);
        return;
    }
    for (auto mem : stores_) {
        if (GetGraph()->CheckInstAlias(inst, mem) != AliasType::NO_ALIAS) {
            AddDep(prio, inst, mem, 1U, lastBarrier);
        }
    }
    if (inst->IsStore()) {
        for (auto mem : loads_) {
            if (mem->IsLoad() && GetGraph()->CheckInstAlias(inst, mem) != AliasType::NO_ALIAS) {
                AddDep(prio, inst, mem, 1U, lastBarrier);
            }
        }
        for (auto ct : special_) {
            AddDep(prio, inst, ct, 1U, lastBarrier);
        }
        stores_.push_back(inst);
    } else {  // means inst->IsLoad()
        loads_.push_back(inst);
    }
}

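// There is no easy way to prove that the checked array and the loaded array are the same, so the
// BoundsCheckI -> LoadArrayI/LoadArrayPairI ordering is restored by the immediate index: loads with the same
// immediate (or the adjacent pair element) may not be moved above the bounds check.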
void Scheduler::ProcessSpecialBoundsCheckI(Inst *inst, uint32_t *prio, Inst *lastBarrier)
{
    auto value = inst->CastToBoundsCheckI()->GetImm();
    // Add dependencies to loads with the same immediate index; there is no easy way to check that the arrays
    // are the same.
    for (auto load : loads_) {
        if (load->GetOpcode() == Opcode::LoadArrayPairI) {
            auto imm = load->CastToLoadArrayPairI()->GetImm();
            if (imm == value || imm + 1 == value) {
                AddDep(prio, inst, load, 1U, lastBarrier);
            }
        } else if (load->GetOpcode() == Opcode::LoadArrayI && load->CastToLoadArrayI()->GetImm() == value) {
            AddDep(prio, inst, load, 1U, lastBarrier);
        }
    }
}

// CanThrow, RuntimeCall and SaveState instructions can't be reordered with each other, and stores can't be
// moved over them
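// Note that stores_/loads_/special_ are filled while walking the block in reverse, so every instruction in
// them lies below 'inst'; the dependencies added here keep 'inst' ahead of later stores and preserve the
// relative order of the special instructions themselves.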
void Scheduler::ProcessSpecial(Inst *inst, uint32_t *prio, Inst *lastBarrier)
{
    for (auto mem : stores_) {
        AddDep(prio, inst, mem, 1U, lastBarrier);
    }
    for (auto ct : special_) {
        AddDep(prio, inst, ct, 1U, lastBarrier);
    }

    if (inst->IsSaveState() && inst != inst->GetBasicBlock()->GetFirstInst()) {
        // SaveStates that have RuntimeCall users are added into a vector to create dependencies from preceding
        // reference instructions
        for (auto &user : inst->GetUsers()) {
            auto userInst = user.GetInst();
            if (userInst->IsRuntimeCall()) {
                ssWithRuntimeCall_.push_back(inst);
                break;
            }
        }
    }
    // 1. SafePoint also has this flag
    // 2. GC triggered inside can poison loaded reference value
    if (inst->IsRuntimeCall()) {
        for (auto mem : loads_) {
            if (mem->GetType() == DataType::REFERENCE || mem->GetType() == DataType::ANY) {
                AddDep(prio, inst, mem, 1U, lastBarrier);
            }
        }
    }
    // We have to "restore" the BoundsCheckI -> LoadArrayI dependency
    if (inst->GetOpcode() == Opcode::BoundsCheckI) {
        ProcessSpecialBoundsCheckI(inst, prio, lastBarrier);
    }
    special_.push_back(inst);
}

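// Emits a barrier into the schedule. A barrier with glued instructions is followed immediately by its two
// LoadPairPart users, and the dependencies outgoing from those parts are released right away.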
void Scheduler::ScheduleBarrierInst(Inst **inst)
{
    sched_.push_back((*inst));
    if ((*inst)->WithGluedInsts()) {
        auto next = (*inst)->GetNext();
        auto nnext = next->GetNext();
        ASSERT(next->GetOpcode() == Opcode::LoadPairPart && nnext->GetOpcode() == Opcode::LoadPairPart);
        sched_.push_back(next);
        sched_.push_back(nnext);
        *inst = nnext;
        for (auto pair : deps_.at(next)) {
            uint32_t numDeps = numDeps_[pair.first];
            ASSERT(numDeps > 0);
            numDeps--;
            numDeps_[pair.first] = numDeps;
        }
        for (auto pair : deps_.at(nnext)) {
            uint32_t numDeps = numDeps_[pair.first];
            ASSERT(numDeps > 0);
            numDeps--;
            numDeps_[pair.first] = numDeps;
        }
    }
}

// Rearranges instructions in the basic block using list scheduling algorithm.
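// Barriers keep their relative order; only the instructions between two consecutive barriers (or between a
// barrier and the block boundary) are reordered, and each such interval is scheduled independently. Barriers
// and single-instruction intervals contribute one cycle each to the returned estimate.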
bool Scheduler::ScheduleBasicBlock(BasicBlock *bb)
{
    COMPILER_LOG(DEBUG, SCHEDULER) << "Schedule BB " << bb->GetId();
    if (!BuildAllDeps(bb)) {
        return false;
    }

    // Schedule intervals between barriers
    uint32_t cycle = 0;
    uint32_t numInst = 0;
    Inst *first = nullptr;
    for (auto inst = bb->GetFirstInst(); inst != nullptr; inst = inst->GetNext()) {
        bool barrier = inst->IsBarrier();
        numInst++;
        inst->ClearMarkers();
        if (first == nullptr) {
            first = inst;
        }
        if (barrier || inst == bb->GetLastInst()) {
            Inst *last = nullptr;
            if (barrier) {
                last = inst->GetPrev();
                numInst--;
            } else {
                last = inst;
            }
            if (numInst > 1) {
                cycle += ScheduleInstsBetweenBarriers(first, last);
            } else if (numInst == 1) {
                ASSERT(first->GetOpcode() != Opcode::LoadPairPart);
                sched_.push_back(first);
                cycle++;
            }
            if (barrier) {
                ASSERT(inst->GetOpcode() != Opcode::LoadPairPart);
                ScheduleBarrierInst(&inst);
                cycle++;
            }
            numInst = 0;
            first = nullptr;
        }
    }
    return FinalizeBB(bb, cycle);
}

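// Rewrites the block in the scheduled order and reports whether anything actually moved: old_ numbers the
// instructions in reverse, so the order is unchanged exactly when every scheduled instruction is followed by
// the one with the next smaller old_ value. Statistics are reported only when the order changed.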
bool Scheduler::FinalizeBB(BasicBlock *bb, uint32_t cycle)
{
    // Rearrange instructions in basic block according to schedule
    bool result = false;
    bool hasPrev = false;
    uint32_t prev;
    for (auto inst : sched_) {
        auto cur = old_[inst];
        if (hasPrev && prev - 1 != cur) {
            result = true;
        }
        prev = cur;
        hasPrev = true;

        bb->EraseInst(inst);
        bb->AppendInst(inst);
    }

    if (result) {
        COMPILER_LOG(DEBUG, SCHEDULER) << "Stats for block " << bb->GetId() << ": old cycles = " << oprev_
                                       << ", num barriers = " << numBarriers_ << ", critical path = " << maxPrio_
                                       << ", scheduled = " << cycle;
        GetGraph()->GetEventWriter().EventScheduler(bb->GetId(), bb->GetGuestPc(), oprev_, numBarriers_, maxPrio_,
                                                    cycle);
    }

    Cleanup();
    return result;
}

void Scheduler::Cleanup()
{
    sched_.clear();
    loads_.clear();
    stores_.clear();
    special_.clear();
    old_.clear();
    ocycle_.clear();
    numDeps_.clear();
    prio_.clear();
    deps_.clear();
    ssWithRuntimeCall_.clear();
}

// Rearranges instructions between [first..last] inclusive, none of them are barriers.
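// Core list-scheduling loop: 'waiting' orders instructions whose predecessors are all scheduled by their
// earliest possible start cycle (asap_), 'ready' orders the instructions that can start at the current cycle
// by critical-path priority (prio_), and both queues break ties in favor of the original program order. The
// cycle counter advances by one per scheduled instruction (group) and jumps forward when nothing is ready.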
uint32_t Scheduler::ScheduleInstsBetweenBarriers(Inst *first, Inst *last)
{
    COMPILER_LOG(DEBUG, SCHEDULER) << "SchedBetween " << first->GetId() << " and " << last->GetId();

    // Compare function for 'waiting' queue
    auto cmpAsap = [this](Inst *left, Inst *right) {
        return asap_[left] > asap_[right] || (asap_[left] == asap_[right] && old_[left] < old_[right]);
    };
    // Queue of instructions whose dependencies are already scheduled but not finished yet
    SchedulerPriorityQueue waiting(cmpAsap, GetGraph()->GetLocalAllocator()->Adapter());

    // Compare function for 'ready' queue
    auto cmpPrio = [this](Inst *left, Inst *right) {
        return prio_[left] < prio_[right] || (prio_[left] == prio_[right] && old_[left] < old_[right]);
    };
    // Queue of ready instructions
    SchedulerPriorityQueue ready(cmpPrio, GetGraph()->GetLocalAllocator()->Adapter());

    // Initialization, add leaves into 'waiting' queue
    uint32_t numInst = 0;
    for (auto inst = first; inst != last->GetNext(); inst = inst->GetNext()) {
        asap_.insert({inst, 1U});
        if (numDeps_[inst] == 0) {
            COMPILER_LOG(DEBUG, SCHEDULER) << "Queue wait add " << inst->GetId();
            waiting.push(inst);
        }
        numInst++;
    }

    // Scheduling
    uint32_t cycle = 1;
    while (numInst > 0) {
        if (ready.empty()) {
            ASSERT(!waiting.empty());
            uint32_t nearest = asap_[waiting.top()];
            // Skip cycles where we can't schedule any instruction
            if (nearest > cycle) {
                cycle = nearest;
            }
        }

        // Move from 'waiting' to 'ready'
        while (!waiting.empty()) {
            Inst *soonest = waiting.top();
            if (asap_[soonest] <= cycle) {
                waiting.pop();
                COMPILER_LOG(DEBUG, SCHEDULER) << "Queue ready moving " << soonest->GetId();
                ready.push(soonest);
            } else {
                break;
            }
        }
        ASSERT(!ready.empty());

        // Schedule the top 'ready' instruction (together with glued ones, when necessary)
        numInst -= SchedWithGlued(ready.top(), &waiting, cycle++);
        ready.pop();
    }

    // Cleanup
    asap_.clear();
    return cycle;
}

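// Schedules 'inst' at the given cycle together with its glued LoadPairPart users (if any), updates the
// earliest start cycle of every dependent instruction, moves dependents whose last predecessor has just been
// scheduled into the 'waiting' queue and returns the number of instructions emitted.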
uint32_t Scheduler::SchedWithGlued(Inst *inst, SchedulerPriorityQueue *waiting, uint32_t cycle)
{
    uint32_t amount = 0;
    // Compare function for 'now' queue
    auto cmpOld = [this](Inst *left, Inst *right) { return old_[left] < old_[right]; };
    // Queue of instructions to schedule at the current cycle
    SchedulerPriorityQueue now(cmpOld, GetGraph()->GetLocalAllocator()->Adapter());
    // Add inst into 'now'
    ASSERT(now.empty());
    now.push(inst);

    // Add glued instructions into 'now'
    if (inst->WithGluedInsts()) {
        for (auto &userItem : inst->GetUsers()) {
            auto user = userItem.GetInst();
            ASSERT(user->GetOpcode() == Opcode::LoadPairPart);
            now.push(user);
        }
    }

    [[maybe_unused]] constexpr auto MAX_NOW_SIZE = 3;
    ASSERT(now.size() <= MAX_NOW_SIZE);

    // Schedule them
    while (!now.empty()) {
        auto cur = now.top();
        now.pop();
        COMPILER_LOG(DEBUG, SCHEDULER) << "Scheduling " << cur->GetId() << " at cycle " << cycle;

        // Adjust all dependent instructions
        for (auto pair : deps_.at(cur)) {
            // Adjust asap
            uint32_t asap = asap_[pair.first];
            asap = std::max(asap, cycle + pair.second);
            asap_[pair.first] = asap;

            // Adjust numDeps
            uint32_t numDeps = numDeps_[pair.first];
            ASSERT(numDeps > 0);
            numDeps--;
            numDeps_[pair.first] = numDeps;

            // Glued instructions are scheduled immediately, so they should not go into the queue
            if (numDeps == 0 && pair.first->GetOpcode() != Opcode::LoadPairPart) {
                COMPILER_LOG(DEBUG, SCHEDULER) << "Queue wait add " << pair.first->GetId();
                waiting->push(pair.first);
            }
        }

        // Add into schedule
        sched_.push_back(cur);
        amount++;
    }

    return amount;
}
}  // namespace ark::compiler