//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements loop unroll and jam as a routine, much like
// LoopUnroll.cpp implements loop unroll.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;

#define DEBUG_TYPE "loop-unroll-and-jam"

STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
STATISTIC(NumCompletelyUnrolledAndJammed,
          "Number of loops fully unroll and jammed");

typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;

// Partition blocks in an outer/inner loop pair into blocks before and after
// the loop
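// As an illustrative sketch (not tied to any particular IR), for a loop of
// the canonical shape
//   for (i) { Fore(i); for (j) { Sub(i, j); } Aft(i); }
// every block that executes after the inner loop is dominated by the inner
// loop's latch, which is the property used below to classify Aft blocks.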
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
                                     BasicBlockSet &ForeBlocks,
                                     BasicBlockSet &SubLoopBlocks,
                                     BasicBlockSet &AftBlocks,
                                     DominatorTree *DT) {
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());

  for (BasicBlock *BB : L->blocks()) {
    if (!SubLoop->contains(BB)) {
      if (DT->dominates(SubLoopLatch, BB))
        AftBlocks.insert(BB);
      else
        ForeBlocks.insert(BB);
    }
  }

  // Check that all blocks in ForeBlocks together dominate the subloop
  // TODO: This might ideally be done better with dominators/postdominators.
  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
  for (BasicBlock *BB : ForeBlocks) {
    if (BB == SubLoopPreHeader)
      continue;
    TerminatorInst *TI = BB->getTerminator();
    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
      if (!ForeBlocks.count(TI->getSuccessor(i)))
        return false;
  }

  return true;
}

// Looks at the phi nodes in Header for values coming from Latch. For these
// instructions and all their operands calls Visit on them, continuing to
// follow the operands of any instruction defined in AftBlocks. Returns false
// as soon as Visit returns false, otherwise returns true. This is used to
// process the instructions in the Aft blocks that need to be moved before the
// subloop. It is used in two places: first to check that the required set of
// instructions can be moved before the loop, then to collect the instructions
// to actually move in moveHeaderPhiOperandsToForeBlocks.
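// As an illustrative sketch (the value names here are hypothetical), given a
// header phi such as
//   %iv = phi i32 [ 0, %preheader ], [ %iv.next, %latch ]
// where %iv.next = add i32 %iv, 1 is defined in an aft block, Visit is called
// on %iv.next and then on each instruction operand of it, recursing only
// through operands that are themselves defined inside AftBlocks.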
template <typename T>
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
                                     BasicBlockSet &AftBlocks, T Visit) {
  SmallVector<Instruction *, 8> Worklist;
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }

  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    if (!Visit(I))
      return false;

    if (AftBlocks.count(I->getParent()))
      for (auto &U : I->operands())
        if (Instruction *II = dyn_cast<Instruction>(U))
          Worklist.push_back(II);
  }

  return true;
}

// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
                                              BasicBlock *Latch,
                                              Instruction *InsertLoc,
                                              BasicBlockSet &AftBlocks) {
  // We need to ensure we move the instructions in the correct order,
  // starting with the earliest required instruction and moving forward.
  std::vector<Instruction *> Visited;
  processHeaderPhiOperands(Header, Latch, AftBlocks,
                           [&Visited, &AftBlocks](Instruction *I) {
                             if (AftBlocks.count(I->getParent()))
                               Visited.push_back(I);
                             return true;
                           });

  // Move all instructions in program order to before the InsertLoc
  BasicBlock *InsertLocBB = InsertLoc->getParent();
  for (Instruction *I : reverse(Visited)) {
    if (I->getParent() != InsertLocBB)
      I->moveBefore(InsertLoc);
  }
}

/*
  This method performs Unroll and Jam. For a simple loop like:
  for (i = ..)
    Fore(i)
    for (j = ..)
      SubLoop(i, j)
    Aft(i)

  Instead of doing normal inner or outer unrolling, we do:
  for (i = .., i+=2)
    Fore(i)
    Fore(i+1)
    for (j = ..)
      SubLoop(i, j)
      SubLoop(i+1, j)
    Aft(i)
    Aft(i+1)

  So the outer loop is essentially unrolled and then the inner loops are fused
  ("jammed") together into a single loop. This can increase speed when there
  are loads in SubLoop that are invariant to i, as they become shared between
  the now jammed inner loops.
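
  For example (an illustrative sketch only), in:
  for (i = ..)
    for (j = ..)
      sum[i] += A[j] * B[i][j]
  the load of A[j] is invariant to i, so after unroll and jam by 2 the two
  jammed inner-loop bodies can share a single load of A[j] per j iteration.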

  We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
  Fore blocks are those before the inner loop, Aft are those after. Normal
  Unroll code is used to copy each of these sets of blocks and the results are
  combined together into the final form above.

  isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitability is also advisable.
*/
LoopUnrollResult
llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                       unsigned TripMultiple, bool UnrollRemainder,
                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, LI, SE, DT, AC, true)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }
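  // For example, with Count == 2 and a trip count only known at run time, the
  // epilogue created above executes TripCount % 2 iterations, so the unrolled
  // body below can assume a whole multiple of 2 iterations.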

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(Preheader && LatchBlock && Header);
  assert(BI && !BI->isUnconditional());
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier.
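  // For example (purely illustrative), with Count == 3 and the ".It" name
  // suffixes used by the cloning below, ForeBlocksFirst ends up holding the
  // original header plus its two clones (suffixed ".1" and ".2").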
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions that the fore-block header phis depend on out of
  // AftBlocks and into the fore blocks, ahead of the subloop.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
      AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc())
            I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock *> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      if (ForeBlocks.count(*BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (SubLoopBlocks.count(*BB)) {
        SubLoop->addBasicBlockToLoop(New, *LI);

        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (AftBlocks.count(*BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal
        // domtree structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        ::remapInstruction(&I, LastValueMap);
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          if (II->getIntrinsicID() == Intrinsic::assume)
            AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // holds the values from the last unrolled iteration.

  // Update Phis in BB from OldBB to point to NewBB
  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
                            BasicBlock *NewBB) {
    for (PHINode &Phi : BB->phis()) {
      int I = Phi.getBasicBlockIndex(OldBB);
      Phi.setIncomingBlock(I, NewBB);
    }
  };
  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  BasicBlock *Dest = SubLoopBlocksFirst[0];
  ForeTerm->setSuccessor(0, Dest);

  if (CompletelyUnroll) {
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    BasicBlock *Dest = ForeBlocksFirst[It];
    ForeTerm->setSuccessor(0, Dest);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
                  ForeBlocksLast.back());
  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration's subloop with
    // an unconditional branch to this iteration's subloop
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
                    ForeBlocksLast.back());
    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, Term);
    Term->eraseFromParent();
  } else {
    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
  }
  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration's aft blocks
    // with an unconditional branch to this iteration's aft blocks
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DT->applyUpdates(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
  while (!MergeBlocks.empty()) {
    BasicBlock *BB = *MergeBlocks.begin();
    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
      BasicBlock *Dest = Term->getSuccessor(0);
      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
        // Don't remove BB and add Fold as they are the same BB
        assert(Fold == BB);
        (void)Fold;
        MergeBlocks.erase(Dest);
      } else
        MergeBlocks.erase(BB);
    } else
      MergeBlocks.erase(BB);
  }

  // At this point, the code is well formed. We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OuterL = L->getParentLoop();
  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  assert(DT->verify());
#endif

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}

static bool getLoadsAndStores(BasicBlockSet &Blocks,
                              SmallVector<Value *, 4> &MemInstr) {
  // Scan the BBs and collect legal loads and stores.
  // Returns false if non-simple loads/stores are found.
  for (BasicBlock *BB : Blocks) {
    for (Instruction &I : *BB) {
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
        if (!Ld->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
        if (!St->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (I.mayReadOrWriteMemory()) {
        return false;
      }
    }
  }
  return true;
}

static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
                              SmallVector<Value *, 4> &Later,
                              unsigned LoopDepth, bool InnerLoop,
                              DependenceInfo &DI) {
  // Use DA to check for dependencies between loads and stores that make unroll
  // and jam invalid.
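  // As an illustrative sketch: if a fore block loads A[i] while an aft block
  // stores A[i+1], the fore load of iteration i+1 reads the value the aft
  // block stored in iteration i. Unroll and jam executes Fore(i+1) before
  // Aft(i), so that backward (>) outer-loop dependence must be rejected here.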
  for (Value *I : Earlier) {
    for (Value *J : Later) {
      Instruction *Src = cast<Instruction>(I);
      Instruction *Dst = cast<Instruction>(J);
      if (Src == Dst)
        continue;
      // Ignore Input dependencies.
      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
        continue;

      // Track dependencies, and if we find them take a conservative approach
      // by allowing only = or < (not >), although some > would be safe
      // (depending upon unroll width).
      // For the inner loop, we need to disallow any (> <) dependencies
      // FIXME: Allow > so long as distance is less than unroll width
      if (auto D = DI.depends(Src, Dst, true)) {
        assert(D->isOrdered() && "Expected an output, flow or anti dep.");

        if (D->isConfused())
          return false;
        if (!InnerLoop) {
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
            return false;
        } else {
          assert(LoopDepth + 1 <= D->getLevels());
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
              D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
            return false;
        }
      }
    }
  }
  return true;
}

static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
                              BasicBlockSet &SubLoopBlocks,
                              BasicBlockSet &AftBlocks, DependenceInfo &DI) {
  // Collect the loads and stores from each set of blocks.
  SmallVector<Value *, 4> ForeMemInstr;
  SmallVector<Value *, 4> SubLoopMemInstr;
  SmallVector<Value *, 4> AftMemInstr;
  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
      !getLoadsAndStores(AftBlocks, AftMemInstr))
    return false;

  // Check for dependencies between any blocks that may change order
  unsigned LoopDepth = L->getLoopDepth();
  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
                           DI) &&
         checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
                           DI) &&
         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
                           DI);
}

bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE,
                                DominatorTree &DT, DependenceInfo &DI) {
  /* We currently handle outer loops like this:
        |
    ForeFirst    <----\        }
     Blocks           |        } ForeBlocks
    ForeLast          |        }
        |             |
    SubLoopFirst  <\  |        }
     Blocks        |  |        } SubLoopBlocks
    SubLoopLast   -/  |        }
        |             |
    AftFirst          |        }
     Blocks           |        } AftBlocks
    AftLast     ------/        }
        |

  There can (theoretically) be any number of blocks in ForeBlocks,
  SubLoopBlocks and AftBlocks, provided that there is one edge from Fores to
  SubLoops, one edge from SubLoops to Afts and a single outer loop exit (from
  Afts). In practice we currently limit Aft blocks to a single block, and
  limit things further in the profitability checks of the unroll and jam pass.

  Because of the way we rearrange basic blocks, we also require that
  the Fore blocks of all unrolled iterations are safe to move before the
  SubLoop blocks of all iterations. So we require that the phi node looping
  operands of ForeHeader can be moved to at least the end of ForeEnd, so that
  we can arrange cloned Fore Blocks before the subloop and match up Phis
  correctly.

  i.e. the original order of blocks is F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
  It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.

  There are then a number of checks along the lines of no calls, no
  exceptions, inner loop IV is consistent, etc. Note that for loops requiring
  runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
  UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
    return false;
  Loop *SubLoop = L->getSubLoops()[0];
  if (!SubLoop->isLoopSimplifyForm())
    return false;

  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Exit = L->getExitingBlock();
  BasicBlock *SubLoopHeader = SubLoop->getHeader();
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();

  if (Latch != Exit)
    return false;
  if (SubLoopLatch != SubLoopExit)
    return false;

  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
    return false;

  // Split blocks into Fore/SubLoop/Aft based on dominators
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
                                AftBlocks, &DT))
    return false;

  // Aft blocks may need to move instructions to fore blocks, which becomes
  // more difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocks.size() != 1)
    return false;

  // Check inner loop IV is consistent between all iterations
  const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
  if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
      !SubLoopBECountSC->getType()->isIntegerTy())
    return false;
  ScalarEvolution::LoopDisposition LD =
      SE.getLoopDisposition(SubLoopBECountSC, L);
  if (LD != ScalarEvolution::LoopInvariant)
    return false;

  // Check the loop safety info for exceptions.
  LoopSafetyInfo LSI;
  computeLoopSafetyInfo(&LSI, L);
  if (LSI.MayThrow)
    return false;

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //   ForeBlocks before Subloop and AftBlocks.
  //   Subloop before AftBlocks.
  //   ForeBlock phi operands before the subloop.
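  //
  // As an illustrative (hypothetical) example of the last point: if the outer
  // IV increment in the aft block were "%iv.next = add %iv, %x" with %x
  // loaded from memory in that same aft block, the load could not be hoisted
  // past the subloop and we would have to bail out below.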

  // Make sure we can move all instructions we need to before the subloop
  if (!processHeaderPhiOperands(
          Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
            if (SubLoop->contains(I->getParent()))
              return false;
            if (AftBlocks.count(I->getParent())) {
              // If we hit a phi node in afts we know we are done (probably
              // LCSSA)
              if (isa<PHINode>(I))
                return false;
              // Can't move instructions with side effects or memory
              // reads/writes
              if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
                return false;
            }
            // Keep going
            return true;
          }))
    return false;

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
    return false;

  return true;
}
