• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===- BufferDeallocation.cpp - the impl for buffer deallocation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements logic for computing correct alloc and dealloc positions.
10 // Furthermore, buffer placement also adds required new alloc and copy
11 // operations to ensure that all buffers are deallocated. The main class is the
12 // BufferDeallocationPass class that implements the underlying algorithm. In
13 // order to put allocations and deallocations at safe positions, it is
14 // critically important to put them into the correct blocks. However, the
15 // liveness analysis does not pay attention to aliases, which can occur due to
16 // branches (and their associated block arguments) in general. For this purpose,
17 // BufferDeallocation firstly finds all possible aliases for a single value
18 // (using the BufferAliasAnalysis class). Consider the following
19 // example:
20 //
21 // ^bb0(%arg0):
22 //   cond_br %cond, ^bb1, ^bb2
23 // ^bb1:
24 //   br ^exit(%arg0)
25 // ^bb2:
26 //   %new_value = ...
27 //   br ^exit(%new_value)
28 // ^exit(%arg1):
29 //   return %arg1;
30 //
31 // We should place the dealloc for %new_value in exit. However, we have to free
32 // the buffer in the same block, because it cannot be freed in the post
33 // dominator. However, this requires a new copy buffer for %arg1 that will
34 // contain the actual contents. Using the class BufferAliasAnalysis, we
35 // will find out that %new_value has a potential alias %arg1. In order to find
36 // the dealloc position we have to find all potential aliases, iterate over
37 // their uses and find the common post-dominator block (note that additional
38 // copies and buffers remove potential aliases and will influence the placement
39 // of the deallocs). In all cases, the computed block can be safely used to free
40 // the %new_value buffer (may be exit or bb2) as it will die and we can use
41 // liveness information to determine the exact operation after which we have to
42 // insert the dealloc. However, the algorithm supports introducing copy buffers
43 // and placing deallocs in safe locations to ensure that all buffers will be
44 // freed in the end.
45 //
46 // TODO:
47 // The current implementation does not support explicit-control-flow loops and
48 // the resulting code will be invalid with respect to program semantics.
49 // However, structured control-flow loops are fully supported. Furthermore, it
50 // doesn't accept functions which return buffers already.
51 //
52 //===----------------------------------------------------------------------===//
53 
54 #include "PassDetail.h"
55 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
56 #include "mlir/Dialect/StandardOps/IR/Ops.h"
57 #include "mlir/IR/Operation.h"
58 #include "mlir/Interfaces/ControlFlowInterfaces.h"
59 #include "mlir/Interfaces/LoopLikeInterface.h"
60 #include "mlir/Pass/Pass.h"
61 #include "mlir/Transforms/BufferUtils.h"
62 #include "mlir/Transforms/Passes.h"
63 #include "llvm/ADT/SetOperations.h"
64 
65 using namespace mlir;
66 
67 /// Walks over all immediate return-like terminators in the given region.
68 template <typename FuncT>
walkReturnOperations(Region * region,const FuncT & func)69 static void walkReturnOperations(Region *region, const FuncT &func) {
70   for (Block &block : *region)
71     for (Operation &operation : block) {
72       // Skip non-return-like terminators.
73       if (operation.hasTrait<OpTrait::ReturnLike>())
74         func(&operation);
75     }
76 }
77 
78 namespace {
79 
80 //===----------------------------------------------------------------------===//
81 // Backedges analysis
82 //===----------------------------------------------------------------------===//
83 
84 /// A straight-forward program analysis which detects loop backedges induced by
85 /// explicit control flow.
86 class Backedges {
87 public:
88   using BlockSetT = SmallPtrSet<Block *, 16>;
89   using BackedgeSetT = llvm::DenseSet<std::pair<Block *, Block *>>;
90 
91 public:
92   /// Constructs a new backedges analysis using the op provided.
Backedges(Operation * op)93   Backedges(Operation *op) { recurse(op, op->getBlock()); }
94 
95   /// Returns the number of backedges formed by explicit control flow.
size() const96   size_t size() const { return edgeSet.size(); }
97 
98   /// Returns the start iterator to loop over all backedges.
begin() const99   BackedgeSetT::const_iterator begin() const { return edgeSet.begin(); }
100 
101   /// Returns the end iterator to loop over all backedges.
end() const102   BackedgeSetT::const_iterator end() const { return edgeSet.end(); }
103 
104 private:
105   /// Enters the current block and inserts a backedge into the `edgeSet` if we
106   /// have already visited the current block. The inserted edge links the given
107   /// `predecessor` with the `current` block.
enter(Block & current,Block * predecessor)108   bool enter(Block &current, Block *predecessor) {
109     bool inserted = visited.insert(&current).second;
110     if (!inserted)
111       edgeSet.insert(std::make_pair(predecessor, &current));
112     return inserted;
113   }
114 
115   /// Leaves the current block.
exit(Block & current)116   void exit(Block &current) { visited.erase(&current); }
117 
118   /// Recurses into the given operation while taking all attached regions into
119   /// account.
recurse(Operation * op,Block * predecessor)120   void recurse(Operation *op, Block *predecessor) {
121     Block *current = op->getBlock();
122     // If the current op implements the `BranchOpInterface`, there can be
123     // cycles in the scope of all successor blocks.
124     if (isa<BranchOpInterface>(op)) {
125       for (Block *succ : current->getSuccessors())
126         recurse(*succ, current);
127     }
128     // Recurse into all distinct regions and check for explicit control-flow
129     // loops.
130     for (Region &region : op->getRegions())
131       recurse(region.front(), current);
132   }
133 
134   /// Recurses into explicit control-flow structures that are given by
135   /// the successor relation defined on the block level.
recurse(Block & block,Block * predecessor)136   void recurse(Block &block, Block *predecessor) {
137     // Try to enter the current block. If this is not possible, we are
138     // currently processing this block and can safely return here.
139     if (!enter(block, predecessor))
140       return;
141 
142     // Recurse into all operations and successor blocks.
143     for (Operation &op : block.getOperations())
144       recurse(&op, predecessor);
145 
146     // Leave the current block.
147     exit(block);
148   }
149 
150   /// Stores all blocks that are currently visited and on the processing stack.
151   BlockSetT visited;
152 
153   /// Stores all backedges in the format (source, target).
154   BackedgeSetT edgeSet;
155 };
156 
157 //===----------------------------------------------------------------------===//
158 // BufferDeallocation
159 //===----------------------------------------------------------------------===//
160 
161 /// The buffer deallocation transformation which ensures that all allocs in the
162 /// program have a corresponding de-allocation. As a side-effect, it might also
163 /// introduce copies that in turn leads to additional allocs and de-allocations.
164 class BufferDeallocation : BufferPlacementTransformationBase {
165 public:
BufferDeallocation(Operation * op)166   BufferDeallocation(Operation *op)
167       : BufferPlacementTransformationBase(op), dominators(op),
168         postDominators(op) {}
169 
170   /// Performs the actual placement/creation of all temporary alloc, copy and
171   /// dealloc nodes.
deallocate()172   void deallocate() {
173     // Add additional allocations and copies that are required.
174     introduceCopies();
175     // Place deallocations for all allocation entries.
176     placeDeallocs();
177   }
178 
179 private:
180   /// Introduces required allocs and copy operations to avoid memory leaks.
introduceCopies()181   void introduceCopies() {
182     // Initialize the set of values that require a dedicated memory free
183     // operation since their operands cannot be safely deallocated in a post
184     // dominator.
185     SmallPtrSet<Value, 8> valuesToFree;
186     llvm::SmallDenseSet<std::tuple<Value, Block *>> visitedValues;
187     SmallVector<std::tuple<Value, Block *>, 8> toProcess;
188 
189     // Check dominance relation for proper dominance properties. If the given
190     // value node does not dominate an alias, we will have to create a copy in
191     // order to free all buffers that can potentially leak into a post
192     // dominator.
193     auto findUnsafeValues = [&](Value source, Block *definingBlock) {
194       auto it = aliases.find(source);
195       if (it == aliases.end())
196         return;
197       for (Value value : it->second) {
198         if (valuesToFree.count(value) > 0)
199           continue;
200         Block *parentBlock = value.getParentBlock();
201         // Check whether we have to free this particular block argument or
202         // generic value. We have to free the current alias if it is either
203         // defined in a non-dominated block or it is defined in the same block
204         // but the current value is not dominated by the source value.
205         if (!dominators.dominates(definingBlock, parentBlock) ||
206             (definingBlock == parentBlock && value.isa<BlockArgument>())) {
207           toProcess.emplace_back(value, parentBlock);
208           valuesToFree.insert(value);
209         } else if (visitedValues.insert(std::make_tuple(value, definingBlock))
210                        .second)
211           toProcess.emplace_back(value, definingBlock);
212       }
213     };
214 
215     // Detect possibly unsafe aliases starting from all allocations.
216     for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
217       Value allocValue = std::get<0>(entry);
218       findUnsafeValues(allocValue, allocValue.getDefiningOp()->getBlock());
219     }
220     // Try to find block arguments that require an explicit free operation
221     // until we reach a fix point.
222     while (!toProcess.empty()) {
223       auto current = toProcess.pop_back_val();
224       findUnsafeValues(std::get<0>(current), std::get<1>(current));
225     }
226 
227     // Update buffer aliases to ensure that we free all buffers and block
228     // arguments at the correct locations.
229     aliases.remove(valuesToFree);
230 
231     // Add new allocs and additional copy operations.
232     for (Value value : valuesToFree) {
233       if (auto blockArg = value.dyn_cast<BlockArgument>())
234         introduceBlockArgCopy(blockArg);
235       else
236         introduceValueCopyForRegionResult(value);
237 
238       // Register the value to require a final dealloc. Note that we do not have
239       // to assign a block here since we do not want to move the allocation node
240       // to another location.
241       allocs.registerAlloc(std::make_tuple(value, nullptr));
242     }
243   }
244 
245   /// Introduces temporary allocs in all predecessors and copies the source
246   /// values into the newly allocated buffers.
introduceBlockArgCopy(BlockArgument blockArg)247   void introduceBlockArgCopy(BlockArgument blockArg) {
248     // Allocate a buffer for the current block argument in the block of
249     // the associated value (which will be a predecessor block by
250     // definition).
251     Block *block = blockArg.getOwner();
252     for (auto it = block->pred_begin(), e = block->pred_end(); it != e; ++it) {
253       // Get the terminator and the value that will be passed to our
254       // argument.
255       Operation *terminator = (*it)->getTerminator();
256       auto branchInterface = cast<BranchOpInterface>(terminator);
257       // Query the associated source value.
258       Value sourceValue =
259           branchInterface.getSuccessorOperands(it.getSuccessorIndex())
260               .getValue()[blockArg.getArgNumber()];
261       // Create a new alloc and copy at the current location of the terminator.
262       Value alloc = introduceBufferCopy(sourceValue, terminator);
263       // Wire new alloc and successor operand.
264       auto mutableOperands =
265           branchInterface.getMutableSuccessorOperands(it.getSuccessorIndex());
266       if (!mutableOperands.hasValue())
267         terminator->emitError() << "terminators with immutable successor "
268                                    "operands are not supported";
269       else
270         mutableOperands.getValue()
271             .slice(blockArg.getArgNumber(), 1)
272             .assign(alloc);
273     }
274 
275     // Check whether the block argument has implicitly defined predecessors via
276     // the RegionBranchOpInterface. This can be the case if the current block
277     // argument belongs to the first block in a region and the parent operation
278     // implements the RegionBranchOpInterface.
279     Region *argRegion = block->getParent();
280     Operation *parentOp = argRegion->getParentOp();
281     RegionBranchOpInterface regionInterface;
282     if (!argRegion || &argRegion->front() != block ||
283         !(regionInterface = dyn_cast<RegionBranchOpInterface>(parentOp)))
284       return;
285 
286     introduceCopiesForRegionSuccessors(
287         regionInterface, argRegion->getParentOp()->getRegions(), blockArg,
288         [&](RegionSuccessor &successorRegion) {
289           // Find a predecessor of our argRegion.
290           return successorRegion.getSuccessor() == argRegion;
291         });
292 
293     // Check whether the block argument belongs to an entry region of the
294     // parent operation. In this case, we have to introduce an additional copy
295     // for buffer that is passed to the argument.
296     SmallVector<RegionSuccessor, 2> successorRegions;
297     regionInterface.getSuccessorRegions(/*index=*/llvm::None, successorRegions);
298     auto *it =
299         llvm::find_if(successorRegions, [&](RegionSuccessor &successorRegion) {
300           return successorRegion.getSuccessor() == argRegion;
301         });
302     if (it == successorRegions.end())
303       return;
304 
305     // Determine the actual operand to introduce a copy for and rewire the
306     // operand to point to the copy instead.
307     Value operand =
308         regionInterface.getSuccessorEntryOperands(argRegion->getRegionNumber())
309             [llvm::find(it->getSuccessorInputs(), blockArg).getIndex()];
310     Value copy = introduceBufferCopy(operand, parentOp);
311 
312     auto op = llvm::find(parentOp->getOperands(), operand);
313     assert(op != parentOp->getOperands().end() &&
314            "parentOp does not contain operand");
315     parentOp->setOperand(op.getIndex(), copy);
316   }
317 
318   /// Introduces temporary allocs in front of all associated nested-region
319   /// terminators and copies the source values into the newly allocated buffers.
introduceValueCopyForRegionResult(Value value)320   void introduceValueCopyForRegionResult(Value value) {
321     // Get the actual result index in the scope of the parent terminator.
322     Operation *operation = value.getDefiningOp();
323     auto regionInterface = cast<RegionBranchOpInterface>(operation);
324     // Filter successors that return to the parent operation.
325     auto regionPredicate = [&](RegionSuccessor &successorRegion) {
326       // If the RegionSuccessor has no associated successor, it will return to
327       // its parent operation.
328       return !successorRegion.getSuccessor();
329     };
330     // Introduce a copy for all region "results" that are returned to the parent
331     // operation. This is required since the parent's result value has been
332     // considered critical. Therefore, the algorithm assumes that a copy of a
333     // previously allocated buffer is returned by the operation (like in the
334     // case of a block argument).
335     introduceCopiesForRegionSuccessors(regionInterface, operation->getRegions(),
336                                        value, regionPredicate);
337   }
338 
339   /// Introduces buffer copies for all terminators in the given regions. The
340   /// regionPredicate is applied to every successor region in order to restrict
341   /// the copies to specific regions.
342   template <typename TPredicate>
introduceCopiesForRegionSuccessors(RegionBranchOpInterface regionInterface,MutableArrayRef<Region> regions,Value argValue,const TPredicate & regionPredicate)343   void introduceCopiesForRegionSuccessors(
344       RegionBranchOpInterface regionInterface, MutableArrayRef<Region> regions,
345       Value argValue, const TPredicate &regionPredicate) {
346     for (Region &region : regions) {
347       // Query the regionInterface to get all successor regions of the current
348       // one.
349       SmallVector<RegionSuccessor, 2> successorRegions;
350       regionInterface.getSuccessorRegions(region.getRegionNumber(),
351                                           successorRegions);
352       // Try to find a matching region successor.
353       RegionSuccessor *regionSuccessor =
354           llvm::find_if(successorRegions, regionPredicate);
355       if (regionSuccessor == successorRegions.end())
356         continue;
357       // Get the operand index in the context of the current successor input
358       // bindings.
359       size_t operandIndex =
360           llvm::find(regionSuccessor->getSuccessorInputs(), argValue)
361               .getIndex();
362 
363       // Iterate over all immediate terminator operations to introduce
364       // new buffer allocations. Thereby, the appropriate terminator operand
365       // will be adjusted to point to the newly allocated buffer instead.
366       walkReturnOperations(&region, [&](Operation *terminator) {
367         // Extract the source value from the current terminator.
368         Value sourceValue = terminator->getOperand(operandIndex);
369         // Create a new alloc at the current location of the terminator.
370         Value alloc = introduceBufferCopy(sourceValue, terminator);
371         // Wire alloc and terminator operand.
372         terminator->setOperand(operandIndex, alloc);
373       });
374     }
375   }
376 
377   /// Creates a new memory allocation for the given source value and copies
378   /// its content into the newly allocated buffer. The terminator operation is
379   /// used to insert the alloc and copy operations at the right places.
introduceBufferCopy(Value sourceValue,Operation * terminator)380   Value introduceBufferCopy(Value sourceValue, Operation *terminator) {
381     // Avoid multiple copies of the same source value. This can happen in the
382     // presence of loops when a branch acts as a backedge while also having
383     // another successor that returns to its parent operation. Note: that
384     // copying copied buffers can introduce memory leaks since the invariant of
385     // BufferPlacement assumes that a buffer will be only copied once into a
386     // temporary buffer. Hence, the construction of copy chains introduces
387     // additional allocations that are not tracked automatically by the
388     // algorithm.
389     if (copiedValues.contains(sourceValue))
390       return sourceValue;
391     // Create a new alloc at the current location of the terminator.
392     auto memRefType = sourceValue.getType().cast<MemRefType>();
393     OpBuilder builder(terminator);
394 
395     // Extract information about dynamically shaped types by
396     // extracting their dynamic dimensions.
397     SmallVector<Value, 4> dynamicOperands;
398     for (auto shapeElement : llvm::enumerate(memRefType.getShape())) {
399       if (!ShapedType::isDynamic(shapeElement.value()))
400         continue;
401       dynamicOperands.push_back(builder.create<DimOp>(
402           terminator->getLoc(), sourceValue, shapeElement.index()));
403     }
404 
405     // TODO: provide a generic interface to create dialect-specific
406     // Alloc and CopyOp nodes.
407     auto alloc = builder.create<AllocOp>(terminator->getLoc(), memRefType,
408                                          dynamicOperands);
409 
410     // Create a new copy operation that copies to contents of the old
411     // allocation to the new one.
412     builder.create<linalg::CopyOp>(terminator->getLoc(), sourceValue, alloc);
413 
414     // Remember the copy of original source value.
415     copiedValues.insert(alloc);
416     return alloc;
417   }
418 
419   /// Finds correct dealloc positions according to the algorithm described at
420   /// the top of the file for all alloc nodes and block arguments that can be
421   /// handled by this analysis.
placeDeallocs() const422   void placeDeallocs() const {
423     // Move or insert deallocs using the previously computed information.
424     // These deallocations will be linked to their associated allocation nodes
425     // since they don't have any aliases that can (potentially) increase their
426     // liveness.
427     for (const BufferPlacementAllocs::AllocEntry &entry : allocs) {
428       Value alloc = std::get<0>(entry);
429       auto aliasesSet = aliases.resolve(alloc);
430       assert(aliasesSet.size() > 0 && "must contain at least one alias");
431 
432       // Determine the actual block to place the dealloc and get liveness
433       // information.
434       Block *placementBlock =
435           findCommonDominator(alloc, aliasesSet, postDominators);
436       const LivenessBlockInfo *livenessInfo =
437           liveness.getLiveness(placementBlock);
438 
439       // We have to ensure that the dealloc will be after the last use of all
440       // aliases of the given value. We first assume that there are no uses in
441       // the placementBlock and that we can safely place the dealloc at the
442       // beginning.
443       Operation *endOperation = &placementBlock->front();
444 
445       // Iterate over all aliases and ensure that the endOperation will point
446       // to the last operation of all potential aliases in the placementBlock.
447       for (Value alias : aliasesSet) {
448         // Ensure that the start operation is at least the defining operation of
449         // the current alias to avoid invalid placement of deallocs for aliases
450         // without any uses.
451         Operation *beforeOp = endOperation;
452         if (alias.getDefiningOp() &&
453             !(beforeOp = placementBlock->findAncestorOpInBlock(
454                   *alias.getDefiningOp())))
455           continue;
456 
457         Operation *aliasEndOperation =
458             livenessInfo->getEndOperation(alias, beforeOp);
459         // Check whether the aliasEndOperation lies in the desired block and
460         // whether it is behind the current endOperation. If yes, this will be
461         // the new endOperation.
462         if (aliasEndOperation->getBlock() == placementBlock &&
463             endOperation->isBeforeInBlock(aliasEndOperation))
464           endOperation = aliasEndOperation;
465       }
466       // endOperation is the last operation behind which we can safely store
467       // the dealloc taking all potential aliases into account.
468 
469       // If there is an existing dealloc, move it to the right place.
470       Operation *deallocOperation = std::get<1>(entry);
471       if (deallocOperation) {
472         deallocOperation->moveAfter(endOperation);
473       } else {
474         // If the Dealloc position is at the terminator operation of the
475         // block, then the value should escape from a deallocation.
476         Operation *nextOp = endOperation->getNextNode();
477         if (!nextOp)
478           continue;
479         // If there is no dealloc node, insert one in the right place.
480         OpBuilder builder(nextOp);
481         builder.create<DeallocOp>(alloc.getLoc(), alloc);
482       }
483     }
484   }
485 
486   /// The dominator info to find the appropriate start operation to move the
487   /// allocs.
488   DominanceInfo dominators;
489 
490   /// The post dominator info to move the dependent allocs in the right
491   /// position.
492   PostDominanceInfo postDominators;
493 
494   /// Stores already copied allocations to avoid additional copies of copies.
495   ValueSetT copiedValues;
496 };
497 
498 //===----------------------------------------------------------------------===//
499 // BufferDeallocationPass
500 //===----------------------------------------------------------------------===//
501 
502 /// The actual buffer deallocation pass that inserts and moves dealloc nodes
503 /// into the right positions. Furthermore, it inserts additional allocs and
504 /// copies if necessary. It uses the algorithm described at the top of the file.
505 struct BufferDeallocationPass : BufferDeallocationBase<BufferDeallocationPass> {
506 
runOnFunction__anon97c32e470111::BufferDeallocationPass507   void runOnFunction() override {
508     // Ensure that there are supported loops only.
509     Backedges backedges(getFunction());
510     if (backedges.size()) {
511       getFunction().emitError(
512           "Structured control-flow loops are supported only.");
513       return;
514     }
515 
516     // Place all required temporary alloc, copy and dealloc nodes.
517     BufferDeallocation deallocation(getFunction());
518     deallocation.deallocate();
519   }
520 };
521 
522 } // end anonymous namespace
523 
524 //===----------------------------------------------------------------------===//
525 // BufferDeallocationPass construction
526 //===----------------------------------------------------------------------===//
527 
createBufferDeallocationPass()528 std::unique_ptr<Pass> mlir::createBufferDeallocationPass() {
529   return std::make_unique<BufferDeallocationPass>();
530 }
531