/**
 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "scheduler.h"

#include "optimizer/analysis/alias_analysis.h"
#include "optimizer/ir/basicblock.h"
#include "compiler_logger.h"

namespace panda::compiler {
/* Instruction scheduling.
 * Current decisions/limitations:
 *
 * 1. The scheduler pass is placed immediately before the register allocator.
 * 2. It rearranges instructions only inside a basic block, not across blocks.
 * 3. No liveness analysis; dependencies are computed from barrier/user/alias information only.
 * 4. No CPU pipeline/resource modeling; only dependency costs are taken into account.
 * 5. Forward list scheduling with the standard critical-path-based priority.
 */
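/* A minimal illustration of the critical-path priority (the latencies here are hypothetical,
 * not taken from any real target): for a dependency chain a -> b -> c, where a has latency 2
 * and b has latency 1, dependency building yields prio_[c] = 0, prio_[b] = 1 and prio_[a] = 3,
 * so whenever several instructions are ready in the same cycle the one with the longest
 * remaining dependency path ('a' here) is picked first.
 */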
bool Scheduler::RunImpl()
{
    COMPILER_LOG(DEBUG, SCHEDULER) << "Run " << GetPassName();
    bool result = false;
    for (auto bb : GetGraph()->GetBlocksRPO()) {
        if (!bb->IsEmpty() && !bb->IsStartBlock()) {
            result |= ScheduleBasicBlock(bb);
        }
    }
    COMPILER_LOG(DEBUG, SCHEDULER) << GetPassName() << " complete";
#ifndef NDEBUG
    GetGraph()->SetSchedulerComplete();
#endif // NDEBUG
    return result;
}

// Dependency helper function
void Scheduler::AddDep(uint32_t *prio, Inst *from, Inst *to, uint32_t latency, Inst *barrier)
{
    if (from == to) {
        return;
    }
    COMPILER_LOG(DEBUG, SCHEDULER) << "Found dependency " << from->GetId() << " -> " << to->GetId() << " latency "
                                   << latency;
    // Estimate old cycle (without scheduling)
    ocycle_[from] = std::max(ocycle_[from], latency + ocycle_[to]);

    // Update instruction priority - "how high the instruction is in the dependency tree"
    *prio = std::max(*prio, latency + prio_[to]);

    // Do not add cross-barrier dependencies into deps_
    if (barrier == nullptr || old_[to] > old_[barrier]) {
        if (deps_.at(from).count(to) == 1) {
            uint32_t old_latency = deps_.at(from).at(to);
            if (old_latency >= latency) {
                return;
            }
        } else {
            num_deps_[to]++;
        }
        deps_.at(from)[to] = latency;
    }
}

// Calculate priority and dependencies
bool Scheduler::BuildAllDeps(BasicBlock *bb)
{
    auto marker_holder = MarkerHolder(GetGraph());
    auto mrk = marker_holder.GetMarker();

    oprev_ = 0;
    num_barriers_ = 0;
    max_prio_ = 0;

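    // The limit below is presumably a compile-time guard: memory and "special" instructions are
    // matched pairwise against every earlier one in the block, so dependency building can degrade
    // quadratically on very long blocks, and such blocks are skipped instead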
    static constexpr uint32_t TOO_LONG_BB = 256;
    uint32_t num_inst = 0;
    uint32_t num_between = 0;
    uint32_t num_special = 0;
    Inst *last_barrier = nullptr;
    for (auto inst : bb->InstsSafeReverse()) {
        ProcessInst(inst, mrk, &num_inst, &num_between, &num_special, &last_barrier);

        if (num_special > TOO_LONG_BB || num_between > TOO_LONG_BB) {
            COMPILER_LOG(DEBUG, SCHEDULER) << "Block " << bb->GetId() << " seems too big for scheduling, skipping";
            Cleanup();
            return false;
        }
    }
    return true;
}

// Build dependencies for one instruction
void Scheduler::ProcessInst(Inst *inst, Marker mrk, uint32_t *num_inst, uint32_t *num_between, uint32_t *num_special,
                            Inst **last_barrier)
{
    uint32_t prio = 0;
    uint32_t inst_latency = inst->Latency();
    bool barrier = inst->IsBarrier();

    (*num_between)++;
    old_.insert({inst, (*num_inst)++});
    ocycle_.insert({inst, ++oprev_});
    num_deps_.insert({inst, 0U});
    deps_.emplace(inst, GetGraph()->GetLocalAllocator()->Adapter());

    // Dependency to the barrier: every instruction must be issued before the next barrier
    if (*last_barrier != nullptr) {
        AddDep(&prio, inst, *last_barrier, 1U, *last_barrier);
    }

    // Dependency from the barrier: instructions after it cannot be moved above it
    if (barrier) {
        Inst *old_last_barrier = *last_barrier;
        *last_barrier = inst;
        num_barriers_++;
        *num_between = 0;
        for (auto user = inst->GetNext(); user != old_last_barrier; user = user->GetNext()) {
            AddDep(&prio, inst, user, 1U, *last_barrier);
        }
    }

    // Users
    for (auto &user_item : inst->GetUsers()) {
        auto user = user_item.GetInst();
        if (user->IsMarked(mrk)) {
            AddDep(&prio, inst, user, inst_latency, *last_barrier);
        }
    }

    if (inst->IsMemory() || inst->IsRefSpecial()) {
        ProcessMemory(inst, &prio, *last_barrier);
        (*num_special)++;
    }

    if (inst->CanThrow() || inst->IsRuntimeCall() || inst->IsSaveState()) {
        ProcessSpecial(inst, &prio, *last_barrier);
        (*num_special)++;
    }

    inst->SetMarker(mrk);
    prio_.insert({inst, prio});
    max_prio_ = std::max(max_prio_, prio);
    oprev_ = ocycle_[inst];
}

// Memory instructions
void Scheduler::ProcessMemory(Inst *inst, uint32_t *prio, Inst *last_barrier)
{
    if (inst->IsRefSpecial()) {
        loads_.push_back(inst);
        return;
    }
    for (auto mem : stores_) {
        if (GetGraph()->CheckInstAlias(inst, mem) != AliasType::NO_ALIAS) {
            AddDep(prio, inst, mem, 1U, last_barrier);
        }
    }
    if (inst->IsStore()) {
        for (auto mem : loads_) {
            if (mem->IsLoad() && GetGraph()->CheckInstAlias(inst, mem) != AliasType::NO_ALIAS) {
                AddDep(prio, inst, mem, 1U, last_barrier);
            }
        }
        for (auto ct : special_) {
            AddDep(prio, inst, ct, 1U, last_barrier);
        }
        stores_.push_back(inst);
    } else {  // means inst->IsLoad()
        loads_.push_back(inst);
    }
}

// CanThrow or SaveState instructions can't be rearranged, and stores can't be moved over them
void Scheduler::ProcessSpecial(Inst *inst, uint32_t *prio, Inst *last_barrier)
{
    for (auto mem : stores_) {
        AddDep(prio, inst, mem, 1U, last_barrier);
    }
    for (auto ct : special_) {
        AddDep(prio, inst, ct, 1U, last_barrier);
    }
    // 1. SafePoint also has this flag
    // 2. GC triggered inside can poison a loaded reference value
    if (inst->IsRuntimeCall()) {
        for (auto mem : loads_) {
            if (mem->GetType() == DataType::REFERENCE) {
                AddDep(prio, inst, mem, 1U, last_barrier);
            }
        }
    }
    // We have to "restore" the BoundsCheckI -> LoadArrayI dependency
    if (inst->GetOpcode() == Opcode::BoundsCheckI) {
        auto value = inst->CastToBoundsCheckI()->GetImm();
        // Add dependencies on loads with the same immediate; there is no easy way to check that the arrays are the same
        for (auto load : loads_) {
            if (load->GetOpcode() == Opcode::LoadArrayPairI) {
                auto imm = load->CastToLoadArrayPairI()->GetImm();
                if (imm == value || imm + 1 == value) {
                    AddDep(prio, inst, load, 1U, last_barrier);
                }
            } else if (load->GetOpcode() == Opcode::LoadArrayI && load->CastToLoadArrayI()->GetImm() == value) {
                AddDep(prio, inst, load, 1U, last_barrier);
            }
        }
    }
    special_.push_back(inst);
}

// Rearranges instructions in the basic block using the list scheduling algorithm.
bool Scheduler::ScheduleBasicBlock(BasicBlock *bb)
{
    COMPILER_LOG(DEBUG, SCHEDULER) << "Schedule BB " << bb->GetId();

    if (!BuildAllDeps(bb)) {
        return false;
    }

    // Schedule intervals between barriers
    uint32_t cycle = 0;
    uint32_t num_inst = 0;
    Inst *first = nullptr;
    for (auto inst = bb->GetFirstInst(); inst != nullptr; inst = inst->GetNext()) {
        bool barrier = inst->IsBarrier();
        num_inst++;
        inst->ClearMarkers();
        if (first == nullptr) {
            first = inst;
        }
        if (barrier || inst == bb->GetLastInst()) {
            Inst *last = nullptr;
            if (barrier) {
                last = inst->GetPrev();
                num_inst--;
            } else {
                last = inst;
            }
            if (num_inst > 1) {
                cycle += ScheduleInstsBetweenBarriers(first, last);
            } else if (num_inst == 1) {
                ASSERT(first->GetOpcode() != Opcode::LoadPairPart);
                sched_.push_back(first);
                cycle++;
            }
            if (barrier) {
                ASSERT(inst->GetOpcode() != Opcode::LoadPairPart);
                sched_.push_back(inst);
                cycle++;
            }
            num_inst = 0;
            first = nullptr;
        }
    }
    return FinalizeBB(bb, cycle);
}

bool Scheduler::FinalizeBB(BasicBlock *bb, uint32_t cycle)
{
    // Rearrange instructions in the basic block according to the schedule
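    // old_ numbers instructions in reverse program order, so an unchanged schedule walks old_ values
    // down by exactly one each step; any break in that pattern means the block was really reordered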
    bool result = false;
    bool has_prev = false;
    uint32_t prev;
    for (auto inst : sched_) {
        auto cur = old_[inst];
        if (has_prev && prev - 1 != cur) {
            result = true;
        }
        prev = cur;
        has_prev = true;

        bb->EraseInst(inst);
        bb->AppendInst(inst);
    }

    if (result) {
        COMPILER_LOG(DEBUG, SCHEDULER) << "Stats for block " << bb->GetId() << ": old cycles = " << oprev_
                                       << ", num barriers = " << num_barriers_ << ", critical path = " << max_prio_
                                       << ", scheduled = " << cycle;
        GetGraph()->GetEventWriter().EventScheduler(bb->GetId(), bb->GetGuestPc(), oprev_, num_barriers_, max_prio_,
                                                    cycle);
    }

    Cleanup();
    return result;
}

void Scheduler::Cleanup()
{
    sched_.clear();
    loads_.clear();
    stores_.clear();
    special_.clear();
    old_.clear();
    ocycle_.clear();
    num_deps_.clear();
    prio_.clear();
    deps_.clear();
}

// Rearranges instructions between [first..last] inclusive, none of them are barriers.
uint32_t Scheduler::ScheduleInstsBetweenBarriers(Inst *first, Inst *last)
{
    COMPILER_LOG(DEBUG, SCHEDULER) << "SchedBetween " << first->GetId() << " and " << last->GetId();

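    // asap_[inst] is the earliest cycle at which inst may be issued, given the latencies of its
    // already scheduled predecessors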
    // Compare function for the 'waiting' queue
    auto cmp_asap = [this](Inst *left, Inst *right) {
        return asap_[left] > asap_[right] || (asap_[left] == asap_[right] && old_[left] < old_[right]);
    };
    // Queue of instructions whose dependencies are already scheduled, but which cannot be issued yet
    SchedulerPriorityQueue waiting(cmp_asap, GetGraph()->GetLocalAllocator()->Adapter());

    // Compare function for the 'ready' queue
    auto cmp_prio = [this](Inst *left, Inst *right) {
        return prio_[left] < prio_[right] || (prio_[left] == prio_[right] && old_[left] < old_[right]);
    };
    // Queue of ready instructions
    SchedulerPriorityQueue ready(cmp_prio, GetGraph()->GetLocalAllocator()->Adapter());

    // Initialization: add leaves into the 'waiting' queue
    uint32_t num_inst = 0;
    for (auto inst = first; inst != last->GetNext(); inst = inst->GetNext()) {
        asap_.insert({inst, 1U});
        if (num_deps_[inst] == 0) {
            COMPILER_LOG(DEBUG, SCHEDULER) << "Queue wait add " << inst->GetId();
            waiting.push(inst);
        }
        num_inst++;
    }

    // Scheduling
    uint32_t cycle = 1;
    while (num_inst > 0) {
        if (ready.empty()) {
            ASSERT(!waiting.empty());
            uint32_t nearest = asap_[waiting.top()];
            // Skip cycles where we can't schedule any instruction
            if (nearest > cycle) {
                cycle = nearest;
            }
        }

        // Move from 'waiting' to 'ready'
        while (!waiting.empty()) {
            Inst *soonest = waiting.top();
            if (asap_[soonest] <= cycle) {
                waiting.pop();
                COMPILER_LOG(DEBUG, SCHEDULER) << "Queue ready moving " << soonest->GetId();
                ready.push(soonest);
            } else {
                break;
            }
        }
        ASSERT(!ready.empty());

        // Schedule the top 'ready' instruction (together with glued ones, when necessary)
        num_inst -= SchedWithGlued(ready.top(), &waiting, cycle++);
        ready.pop();
    }

    // Cleanup
    asap_.clear();
    return cycle;
}

uint32_t Scheduler::SchedWithGlued(Inst *inst, SchedulerPriorityQueue *waiting, uint32_t cycle)
{
    uint32_t amount = 0;
    // Compare function for the 'now' queue
    auto cmp_old = [this](Inst *left, Inst *right) { return old_[left] < old_[right]; };
    // Queue of instructions to schedule at the current cycle
    SchedulerPriorityQueue now(cmp_old, GetGraph()->GetLocalAllocator()->Adapter());
    // Add inst into 'now'
    ASSERT(now.empty());
    now.push(inst);

    // Add glued instructions into 'now'
    if (inst->GetOpcode() == Opcode::LoadArrayPair || inst->GetOpcode() == Opcode::LoadArrayPairI) {
        for (auto &user_item : inst->GetUsers()) {
            auto user = user_item.GetInst();
            ASSERT(user->GetOpcode() == Opcode::LoadPairPart);
            now.push(user);
        }
    }

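    // 'now' holds the instruction itself plus, for pair loads, its glued LoadPairPart users
    // (at most two), hence the limit of three below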
    [[maybe_unused]] constexpr auto MAX_NOW_SIZE = 3;
    ASSERT(now.size() <= MAX_NOW_SIZE);

    // Schedule them
    while (!now.empty()) {
        auto cur = now.top();
        now.pop();
        COMPILER_LOG(DEBUG, SCHEDULER) << "Scheduling " << cur->GetId() << " at cycle " << cycle;

        // Adjust all dependent instructions
        for (auto pair : deps_.at(cur)) {
            // Adjust asap
            uint32_t asap = asap_[pair.first];
            asap = std::max(asap, cycle + pair.second);
            asap_[pair.first] = asap;

            // Adjust num_deps
            uint32_t num_deps = num_deps_[pair.first];
            ASSERT(num_deps > 0);
            num_deps--;
            num_deps_[pair.first] = num_deps;

            // Glued instructions are scheduled immediately, so they should not go into the queue
            if (num_deps == 0 && pair.first->GetOpcode() != Opcode::LoadPairPart) {
                COMPILER_LOG(DEBUG, SCHEDULER) << "Queue wait add " << pair.first->GetId();
                waiting->push(pair.first);
            }
        }

        // Add into the schedule
        sched_.push_back(cur);
        amount++;
    }

    return amount;
}
}  // namespace panda::compiler