// Copyright (c) 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "source/opt/loop_fission.h"

#include <set>

#include "source/opt/register_pressure.h"

// Implement loop fission, with an optional parameter to split a loop only
// if the register pressure in that loop meets a certain criterion. This is
// controlled via the constructors of LoopFissionPass.
//
// 1 - Build a list of loops to be split. These are the innermost loops (loops
// without child loops) which meet the register pressure criterion, as
// determined by the ShouldSplitLoop method of LoopFissionPass.
//
// 2 - For each loop in the list, group the instructions into sets of related
// instructions by traversing each instruction's users and operands
// recursively. We stop if we encounter an instruction we have seen before or
// an instruction which we don't consider relevant (e.g. OpLoopMerge). We then
// merge these groups into two different sets, one for the first loop and one
// for the second.
//
// 3 - We then run CanPerformSplit to check that it would be legal to split the
// loop using those two sets. We check that we haven't altered the relative
// order in which loads/stores appear in the binary and that we aren't breaking
// any dependence between loads/stores by splitting them into two loops. We
// also check that none of the OpBranch instructions are dependent on a load;
// as we leave the control flow structure intact and move only instructions in
// the body, we want to avoid any loads with side effects or aliasing.
//
// 4 - We then split the loop by calling SplitLoop. This function clones the
// loop, attaches the clone to the preheader, and connects the new loop's merge
// block to the current loop's header block. We then use the two sets built in
// step 2 to remove instructions from each loop. If an instruction appears in
// the first set it is removed from the second loop and vice versa.
//
// 5 - If the multiple split passes flag is set we check whether each of the
// resulting loops still meets the register pressure criterion. If it does, we
// add it to the list of loops to be split (created in step one) to allow loops
// to be split multiple times.
//
namespace spvtools {
namespace opt {

class LoopFissionImpl {
 public:
  LoopFissionImpl(IRContext* context, Loop* loop)
      : context_(context), loop_(loop), load_used_in_condition_(false) {}

  // Group each instruction in the loop into sets of instructions related by
  // their use-def chains. An instruction which uses another will appear in the
  // same set. Then merge those sets into just two sets. Returns false if one
  // or fewer sets were created.
  bool GroupInstructionsByUseDef();

  // Check if the sets built by GroupInstructionsByUseDef violate any data
  // dependence rules.
  bool CanPerformSplit();

  // Split the loop and return a pointer to the new loop.
  Loop* SplitLoop();

  // Checks if |inst| is safe to move. We can only move instructions which
  // don't have any side effects, along with OpLoads and OpStores.
  bool MovableInstruction(const Instruction& inst) const;

 private:
  // Traverse the def-use chain of |inst| and add the users and uses of |inst|
  // which are in the same loop to the |returned_set|.
  void TraverseUseDef(Instruction* inst, std::set<Instruction*>* returned_set,
                      bool ignore_phi_users = false, bool report_loads = false);

  // We group the instructions in the block into two different groups, the
  // instructions to be kept in the original loop and the ones to be cloned
  // into the new loop. As the cloned loop is attached to the preheader, it
  // will be the first loop and the original will be the second.
  std::set<Instruction*> cloned_loop_instructions_;
  std::set<Instruction*> original_loop_instructions_;

  // We need a set of all the instructions seen so far so we can break any
  // recursion and also so we can ignore certain instructions by preemptively
  // adding them to this set.
  std::set<Instruction*> seen_instructions_;

  // A map of instructions to their relative position in the function.
  std::map<Instruction*, size_t> instruction_order_;

  IRContext* context_;

  Loop* loop_;

  // This is set to true by TraverseUseDef if, while traversing the
  // instructions related to the loop condition and any if conditions, any of
  // those instructions is a load.
  bool load_used_in_condition_;
};

bool LoopFissionImpl::MovableInstruction(const Instruction& inst) const {
  return inst.opcode() == SpvOp::SpvOpLoad ||
         inst.opcode() == SpvOp::SpvOpStore ||
         inst.opcode() == SpvOp::SpvOpSelectionMerge ||
         inst.opcode() == SpvOp::SpvOpPhi || inst.IsOpcodeCodeMotionSafe();
}

void LoopFissionImpl::TraverseUseDef(Instruction* inst,
                                     std::set<Instruction*>* returned_set,
                                     bool ignore_phi_users,
                                     bool report_loads) {
  assert(returned_set && "Set to be returned cannot be null.");

  analysis::DefUseManager* def_use = context_->get_def_use_mgr();
  std::set<Instruction*>& inst_set = *returned_set;

  // We create this functor to traverse the use-def chain to build the
  // grouping of related instructions. The lambda captures the std::function
  // to allow it to recurse.
  std::function<void(Instruction*)> traverser_functor;
  traverser_functor = [this, def_use, &inst_set, &traverser_functor,
                       ignore_phi_users, report_loads](Instruction* user) {
    // If we've seen the instruction before or it is not inside the loop, end
    // the traversal.
    if (!user || seen_instructions_.count(user) != 0 ||
        !context_->get_instr_block(user) ||
        !loop_->IsInsideLoop(context_->get_instr_block(user))) {
      return;
    }

    // Don't include labels or loop merge instructions in the instruction sets.
    // Including them would mean we group instructions related only by using
    // the same labels (i.e. phis). We already preempt the inclusion of
    // OpSelectionMerge by adding related instructions to the
    // seen_instructions_ set.
    if (user->opcode() == SpvOp::SpvOpLoopMerge ||
        user->opcode() == SpvOp::SpvOpLabel)
      return;

    // If the |report_loads| flag is set, set the class field
    // load_used_in_condition_ to true. This is used to check that none of the
    // condition checks in the loop rely on loads.
    if (user->opcode() == SpvOp::SpvOpLoad && report_loads) {
      load_used_in_condition_ = true;
    }

    // Add the instruction to the set of instructions already seen. This
    // breaks recursion and allows us to ignore certain instructions.
    seen_instructions_.insert(user);

    inst_set.insert(user);

    // Wrapper functor to traverse the operands of each instruction.
    auto traverse_operand = [&traverser_functor, def_use](const uint32_t* id) {
      traverser_functor(def_use->GetDef(*id));
    };
    user->ForEachInOperand(traverse_operand);

    // For the first traversal (when |ignore_phi_users| is set) we don't
    // traverse the users of the phi.
    if (ignore_phi_users && user->opcode() == SpvOp::SpvOpPhi) return;

    // Traverse each user with this lambda.
    def_use->ForEachUser(user, traverser_functor);

    // Wrapper functor for the use traversal.
    auto traverse_use = [&traverser_functor](Instruction* use, uint32_t) {
      traverser_functor(use);
    };
    def_use->ForEachUse(user, traverse_use);
  };

  // We start the traversal of the use-def graph by invoking the above
  // lambda with the |inst| parameter.
  traverser_functor(inst);
}

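// As a hypothetical illustration of the grouping performed by
// GroupInstructionsByUseDef below (the SPIR-V result ids are invented for this
// comment): if a loop body contains
//
//   %mul = OpFMul %float %a %b
//          OpStore %ptr %mul
//
// then a traversal starting at either instruction places the multiply, the
// store, and any in-loop instructions defining %a, %b and %ptr into the same
// set, because they are all connected through their use-def chains.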
bool LoopFissionImpl::GroupInstructionsByUseDef() {
  std::vector<std::set<Instruction*>> sets{};

  // We want to ignore all the instructions stemming from the loop condition
  // instruction.
  BasicBlock* condition_block = loop_->FindConditionBlock();

  if (!condition_block) return false;
  Instruction* condition = &*condition_block->tail();

  // We iterate over the loop's blocks by walking all the blocks in the
  // function, so that we visit them in the same order in which they appear in
  // the binary.
  Function& function = *loop_->GetHeaderBlock()->GetParent();

  // Create a temporary set to ignore certain groups of instructions within
  // the loop. We don't want any instructions related to control flow to be
  // removed from either loop, only instructions within the control flow
  // bodies.
  std::set<Instruction*> instructions_to_ignore{};
  TraverseUseDef(condition, &instructions_to_ignore, true, true);

  // Traverse control flow instructions to ensure they are added to the
  // seen_instructions_ set and will be ignored when TraverseUseDef is called
  // to build the actual sets.
  for (BasicBlock& block : function) {
    if (!loop_->IsInsideLoop(block.id())) continue;

    for (Instruction& inst : block) {
      // Ignore all instructions related to control flow.
      if (inst.opcode() == SpvOp::SpvOpSelectionMerge || inst.IsBranch()) {
        TraverseUseDef(&inst, &instructions_to_ignore, true, true);
      }
    }
  }

  // Traverse the instructions and generate the sets, automatically ignoring
  // any instructions in instructions_to_ignore.
  for (BasicBlock& block : function) {
    if (!loop_->IsInsideLoop(block.id()) ||
        loop_->GetHeaderBlock()->id() == block.id())
      continue;

    for (Instruction& inst : block) {
      // Record the order that each load/store is seen.
      if (inst.opcode() == SpvOp::SpvOpLoad ||
          inst.opcode() == SpvOp::SpvOpStore) {
        instruction_order_[&inst] = instruction_order_.size();
      }

      // Ignore instructions already seen in a traversal.
      if (seen_instructions_.count(&inst) != 0) {
        continue;
      }

      // Build the set.
      std::set<Instruction*> inst_set{};
      TraverseUseDef(&inst, &inst_set);
      if (!inst_set.empty()) sets.push_back(std::move(inst_set));
    }
  }

  // If we have one or zero sets, return false to indicate that, due to
  // insufficient instructions, we couldn't split the loop into two groups and
  // thus the loop can't be split any further.
  if (sets.size() < 2) {
    return false;
  }

  // Merge the loop sets into two different sets. In CanPerformSplit we will
  // validate that we don't break the relative ordering of loads/stores by
  // doing this.
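  // Purely as an illustrative note (the set count here is hypothetical): if
  // the grouping above produced four related sets S0..S3, then S0 and S1 are
  // merged into cloned_loop_instructions_ (the loop that will run first) and
  // S2 and S3 into original_loop_instructions_.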
  for (size_t index = 0; index < sets.size() / 2; ++index) {
    cloned_loop_instructions_.insert(sets[index].begin(), sets[index].end());
  }
  for (size_t index = sets.size() / 2; index < sets.size(); ++index) {
    original_loop_instructions_.insert(sets[index].begin(), sets[index].end());
  }

  return true;
}

bool LoopFissionImpl::CanPerformSplit() {
  // Return false if any of the condition instructions in the loop depend on a
  // load.
  if (load_used_in_condition_) {
    return false;
  }

  // Build a list of all parent loops of this loop. Loop dependence analysis
  // needs this structure.
  std::vector<const Loop*> loops;
  Loop* parent_loop = loop_;
  while (parent_loop) {
    loops.push_back(parent_loop);
    parent_loop = parent_loop->GetParent();
  }

  LoopDependenceAnalysis analysis{context_, loops};

  // A list of all the stores in the cloned loop.
  std::vector<Instruction*> set_one_stores{};

  // A list of all the loads in the cloned loop.
  std::vector<Instruction*> set_one_loads{};

  // Populate the above lists.
  for (Instruction* inst : cloned_loop_instructions_) {
    if (inst->opcode() == SpvOp::SpvOpStore) {
      set_one_stores.push_back(inst);
    } else if (inst->opcode() == SpvOp::SpvOpLoad) {
      set_one_loads.push_back(inst);
    }

    // If we find any instruction which we can't move (such as a barrier),
    // return false.
    if (!MovableInstruction(*inst)) return false;
  }

  // We need to calculate the depth of the loop to create the loop dependency
  // distance vectors.
  const size_t loop_depth = loop_->GetDepth();

  // Check the dependencies between loads in the cloned loop and stores in the
  // original and vice versa.
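  // As a hypothetical illustration: suppose the cloned loop contains a store
  // to a[i] and the original loop contains a load of a[i + 1]. In the unsplit
  // loop each load reads an element that is only written by a later iteration,
  // so running the entire cloned (store) loop first would change the values
  // those loads observe; the checks below are meant to reject such a split.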
  for (Instruction* inst : original_loop_instructions_) {
    // If we find any instruction which we can't move (such as a barrier),
    // return false.
    if (!MovableInstruction(*inst)) return false;

    // Look at the dependency between the loads in the original and stores in
    // the cloned loops.
    if (inst->opcode() == SpvOp::SpvOpLoad) {
      for (Instruction* store : set_one_stores) {
        DistanceVector vec{loop_depth};

        // If the store actually should appear after the load, return false.
        // This means the store has been placed in the wrong grouping.
        if (instruction_order_[store] > instruction_order_[inst]) {
          return false;
        }
        // If not independent, check the distance vector.
        if (!analysis.GetDependence(store, inst, &vec)) {
          for (DistanceEntry& entry : vec.GetEntries()) {
            // A distance greater than zero means that the store in the cloned
            // loop has a dependency on the load in the original loop.
            if (entry.distance > 0) return false;
          }
        }
      }
    } else if (inst->opcode() == SpvOp::SpvOpStore) {
      for (Instruction* load : set_one_loads) {
        DistanceVector vec{loop_depth};

        // If the load actually should appear after the store, return false.
        if (instruction_order_[load] > instruction_order_[inst]) {
          return false;
        }

        // If not independent, check the distance vector.
        if (!analysis.GetDependence(inst, load, &vec)) {
          for (DistanceEntry& entry : vec.GetEntries()) {
            // A distance less than zero means the load in the cloned loop is
            // dependent on the store instruction in the original loop.
            if (entry.distance < 0) return false;
          }
        }
      }
    }
  }
  return true;
}

Loop* LoopFissionImpl::SplitLoop() {
  // Clone the loop.
  LoopUtils util{context_, loop_};
  LoopUtils::LoopCloningResult clone_results;
  Loop* cloned_loop = util.CloneAndAttachLoopToHeader(&clone_results);

  // Update the OpLoopMerge in the cloned loop.
  cloned_loop->UpdateLoopMergeInst();

  // Add the cloned loop's blocks to the function, after the preheader of the
  // original loop.
  // TODO(1841): Handle failure to create pre-header.
  Function::iterator it =
      util.GetFunction()->FindBlock(loop_->GetOrCreatePreHeaderBlock()->id());
  util.GetFunction()->AddBasicBlocks(clone_results.cloned_bb_.begin(),
                                     clone_results.cloned_bb_.end(), ++it);
  loop_->SetPreHeaderBlock(cloned_loop->GetMergeBlock());

  std::vector<Instruction*> instructions_to_kill{};

  // Kill all the instructions in the original loop which should appear only in
  // the cloned loop.
  for (uint32_t id : loop_->GetBlocks()) {
    BasicBlock* block = context_->cfg()->block(id);

    for (Instruction& inst : *block) {
      // If the instruction appears only in the cloned loop instruction group,
      // kill it.
      if (cloned_loop_instructions_.count(&inst) == 1 &&
          original_loop_instructions_.count(&inst) == 0) {
        instructions_to_kill.push_back(&inst);
        if (inst.opcode() == SpvOp::SpvOpPhi) {
          context_->ReplaceAllUsesWith(
              inst.result_id(), clone_results.value_map_[inst.result_id()]);
        }
      }
    }
  }

  // Kill all the instructions in the cloned loop which should appear only in
  // the original loop.
  for (uint32_t id : cloned_loop->GetBlocks()) {
    BasicBlock* block = context_->cfg()->block(id);
    for (Instruction& inst : *block) {
      Instruction* old_inst = clone_results.ptr_map_[&inst];
      // If the instruction belongs only to the original loop instruction
      // group, kill it.
      if (cloned_loop_instructions_.count(old_inst) == 0 &&
          original_loop_instructions_.count(old_inst) == 1) {
        instructions_to_kill.push_back(&inst);
      }
    }
  }

  for (Instruction* i : instructions_to_kill) {
    context_->KillInst(i);
  }

  return cloned_loop;
}

LoopFissionPass::LoopFissionPass(const size_t register_threshold_to_split,
                                 bool split_multiple_times)
    : split_multiple_times_(split_multiple_times) {
  // Split if the number of registers in the loop exceeds
  // |register_threshold_to_split|.
  split_criteria_ =
      [register_threshold_to_split](
          const RegisterLiveness::RegionRegisterLiveness& liveness) {
        return liveness.used_registers_ > register_threshold_to_split;
      };
}
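
// As a usage sketch (the target environment and the threshold value are
// arbitrary examples, not recommendations), a client would typically enable
// this pass through the optimizer API:
//
//   spvtools::Optimizer opt(SPV_ENV_UNIVERSAL_1_3);
//   opt.RegisterPass(spvtools::CreateLoopFissionPass(20));
//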

LoopFissionPass::LoopFissionPass() : split_multiple_times_(false) {
  // Split by default.
  split_criteria_ = [](const RegisterLiveness::RegionRegisterLiveness&) {
    return true;
  };
}

bool LoopFissionPass::ShouldSplitLoop(const Loop& loop, IRContext* c) {
  LivenessAnalysis* analysis = c->GetLivenessAnalysis();

  RegisterLiveness::RegionRegisterLiveness liveness{};

  Function* function = loop.GetHeaderBlock()->GetParent();
  analysis->Get(function)->ComputeLoopRegisterPressure(loop, &liveness);

  return split_criteria_(liveness);
}

Pass::Status LoopFissionPass::Process() {
  bool changed = false;

  for (Function& f : *context()->module()) {
    // We collect all the innermost loops in the function and run the loop
    // splitting util on each of them. We do this so we can iterate over each,
    // as creating new loops will invalidate the loop iterator.
    std::vector<Loop*> inner_most_loops{};
    LoopDescriptor& loop_descriptor = *context()->GetLoopDescriptor(&f);
    for (Loop& loop : loop_descriptor) {
      if (!loop.HasChildren() && ShouldSplitLoop(loop, context())) {
        inner_most_loops.push_back(&loop);
      }
    }

    // List of new loops which meet the criteria to be split again.
    std::vector<Loop*> new_loops_to_split{};

    while (!inner_most_loops.empty()) {
      for (Loop* loop : inner_most_loops) {
        LoopFissionImpl impl{context(), loop};

        // Group the instructions in the loop into two different sets of
        // related instructions. If we can't group the instructions into the
        // two sets then we can't split the loop any further.
        if (!impl.GroupInstructionsByUseDef()) {
          continue;
        }

        if (impl.CanPerformSplit()) {
          Loop* second_loop = impl.SplitLoop();
          changed = true;
          context()->InvalidateAnalysesExceptFor(
              IRContext::kAnalysisLoopAnalysis);

          // If the newly created loop meets the criteria to be split, split
          // it again.
          if (ShouldSplitLoop(*second_loop, context()))
            new_loops_to_split.push_back(second_loop);

          // If the original loop (now split) still meets the criteria to be
          // split, split it again.
          if (ShouldSplitLoop(*loop, context()))
            new_loops_to_split.push_back(loop);
        }
      }

      // If the split multiple times flag has been set add the new loops which
      // meet the splitting criteria into the list of loops to be split on the
      // next iteration.
      if (split_multiple_times_) {
        inner_most_loops = std::move(new_loops_to_split);
      } else {
        break;
      }
    }
  }

  return changed ? Pass::Status::SuccessWithChange
                 : Pass::Status::SuccessWithoutChange;
}

}  // namespace opt
}  // namespace spvtools