//===- BufferDeallocation.cpp - the impl for buffer deallocation ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements logic for computing correct alloc and dealloc
// positions. Furthermore, buffer deallocation also adds the required new
// alloc and copy operations to ensure that all buffers are deallocated. The
// main class is the BufferDeallocationPass class that implements the
// underlying algorithm. To put allocations and deallocations at safe
// positions, it is crucial to place them in the correct blocks. However, the
// liveness analysis does not pay attention to aliases, which can occur due to
// branches (and their associated block arguments) in general. For this
// purpose, BufferDeallocation first finds all possible aliases for a single
// value (using the BufferAliasAnalysis class). Consider the following
// example:
//
// ^bb0(%arg0):
//   cond_br %cond, ^bb1, ^bb2
// ^bb1:
//   br ^exit(%arg0)
// ^bb2:
//   %new_value = ...
//   br ^exit(%new_value)
// ^exit(%arg1):
//   return %arg1;
//
// We should place the dealloc for %new_value in ^exit. However, %new_value
// cannot simply be freed in the post dominator, because it reaches ^exit
// merged with other values through the block argument %arg1. This requires a
// new copy buffer for %arg1 that will contain the actual contents. Using the
// BufferAliasAnalysis class, we find out that %new_value has a potential
// alias %arg1. In order to find the dealloc position we have to find all
// potential aliases, iterate over their uses and find the common
// post-dominator block (note that additional copies and buffers remove
// potential aliases and will influence the placement of the deallocs). In all
// cases, the computed block can be safely used to free the %new_value buffer
// (here ^exit or ^bb2) as the buffer is dead afterwards, and we can use
// liveness information to determine the exact operation after which we have
// to insert the dealloc. In addition, the algorithm introduces copy buffers
// and places deallocs in safe locations to ensure that all buffers will be
// freed in the end.
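//
// For illustration, after the transformation the example above looks roughly
// as follows (a sketch: the alloc/linalg.copy spellings match what this pass
// currently emits, while the SSA names %copy0/%copy1 are illustrative; the
// escaping return of %arg1 is elided here, see the TODO below):
//
// ^bb0(%arg0):
//   cond_br %cond, ^bb1, ^bb2
// ^bb1:
//   %copy0 = alloc(...)
//   linalg.copy(%arg0, %copy0)
//   br ^exit(%copy0)
// ^bb2:
//   %new_value = ...
//   %copy1 = alloc(...)
//   linalg.copy(%new_value, %copy1)
//   dealloc %new_value
//   br ^exit(%copy1)
// ^exit(%arg1):
//   ...                          // last use of %arg1
//   dealloc %arg1
//   return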
//
// TODO:
// The current implementation does not support explicit-control-flow loops,
// and the resulting code would be invalid with respect to program semantics.
// However, structured control-flow loops are fully supported. Furthermore,
// functions that already return buffers are not accepted yet.
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/BufferUtils.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/SetOperations.h"

using namespace mlir;

/// Walks over all immediate return-like terminators in the given region.
template <typename FuncT>
static void walkReturnOperations(Region *region, const FuncT &func) {
  for (Block &block : *region)
    for (Operation &operation : block) {
      // Only apply `func` to return-like terminators; skip everything else.
      if (operation.hasTrait<OpTrait::ReturnLike>())
        func(&operation);
    }
}

namespace {

//===----------------------------------------------------------------------===//
// Backedges analysis
//===----------------------------------------------------------------------===//

/// A straightforward program analysis which detects loop backedges induced by
/// explicit control flow.
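///
/// For example, in the following CFG sketch the DFS enters ^bb1 and, while
/// still processing it, reaches it again through its first successor; the
/// edge (^bb1, ^bb1) is therefore recorded as a backedge:
///
///   ^bb0:
///     br ^bb1
///   ^bb1:
///     cond_br %cond, ^bb1, ^bb2
///   ^bb2:
///     return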
class Backedges {
public:
  using BlockSetT = SmallPtrSet<Block *, 16>;
  using BackedgeSetT = llvm::DenseSet<std::pair<Block *, Block *>>;

  /// Constructs a new backedges analysis using the op provided.
  Backedges(Operation *op) { recurse(op, op->getBlock()); }

  /// Returns the number of backedges formed by explicit control flow.
  size_t size() const { return edgeSet.size(); }

  /// Returns the start iterator to loop over all backedges.
  BackedgeSetT::const_iterator begin() const { return edgeSet.begin(); }

  /// Returns the end iterator to loop over all backedges.
  BackedgeSetT::const_iterator end() const { return edgeSet.end(); }

private:
  /// Enters the current block and inserts a backedge into the `edgeSet` if we
  /// have already visited the current block. The inserted edge links the given
  /// `predecessor` with the `current` block.
  bool enter(Block &current, Block *predecessor) {
    bool inserted = visited.insert(&current).second;
    if (!inserted)
      edgeSet.insert(std::make_pair(predecessor, &current));
    return inserted;
  }

  /// Leaves the current block.
  void exit(Block &current) { visited.erase(&current); }

  /// Recurses into the given operation while taking all attached regions into
  /// account.
  void recurse(Operation *op, Block *predecessor) {
    Block *current = op->getBlock();
    // If the current op implements the `BranchOpInterface`, there can be
    // cycles in the scope of all successor blocks.
    if (isa<BranchOpInterface>(op)) {
      for (Block *succ : current->getSuccessors())
        recurse(*succ, current);
    }
    // Recurse into all distinct regions and check for explicit control-flow
    // loops. Skip empty regions, which have no entry block to recurse into.
    for (Region &region : op->getRegions())
      if (!region.empty())
        recurse(region.front(), current);
  }

  /// Recurses into explicit control-flow structures that are given by
  /// the successor relation defined on the block level.
  void recurse(Block &block, Block *predecessor) {
    // Try to enter the current block. If this is not possible, we are
    // currently processing this block and can safely return here.
    if (!enter(block, predecessor))
      return;

    // Recurse into all operations and successor blocks.
    for (Operation &op : block.getOperations())
      recurse(&op, predecessor);

    // Leave the current block.
    exit(block);
  }

  /// Stores all blocks that are currently visited and on the processing stack.
  BlockSetT visited;

  /// Stores all backedges in the format (source, target).
  BackedgeSetT edgeSet;
};

//===----------------------------------------------------------------------===//
// BufferDeallocation
//===----------------------------------------------------------------------===//

/// The buffer deallocation transformation which ensures that all allocs in
/// the program have a corresponding de-allocation. As a side-effect, it might
/// also introduce copies that in turn lead to additional allocs and
/// de-allocations.
class BufferDeallocation : BufferPlacementTransformationBase {
public:
  BufferDeallocation(Operation *op)
      : BufferPlacementTransformationBase(op), dominators(op),
        postDominators(op) {}

  /// Performs the actual placement/creation of all temporary alloc, copy and
  /// dealloc nodes.
  void deallocate() {
    // Add additional allocations and copies that are required.
    introduceCopies();
    // Place deallocations for all allocation entries.
    placeDeallocs();
  }

private:
  /// Introduces required allocs and copy operations to avoid memory leaks.
  void introduceCopies() {
    // Initialize the set of values that require a dedicated memory free
    // operation since their operands cannot be safely deallocated in a post
    // dominator.
    SmallPtrSet<Value, 8> valuesToFree;
    llvm::SmallDenseSet<std::tuple<Value, Block *>> visitedValues;
    SmallVector<std::tuple<Value, Block *>, 8> toProcess;

    // Check the dominance relation for proper dominance properties. If the
    // given value node does not dominate an alias, we will have to create a
    // copy in order to free all buffers that can potentially leak into a post
    // dominator.
    auto findUnsafeValues = [&](Value source, Block *definingBlock) {
      auto it = aliases.find(source);
      if (it == aliases.end())
        return;
      for (Value value : it->second) {
        if (valuesToFree.count(value) > 0)
          continue;
        Block *parentBlock = value.getParentBlock();
        // Check whether we have to free this particular block argument or
        // generic value. We have to free the current alias if it is either
        // defined in a non-dominated block or it is defined in the same block
        // but the current value is not dominated by the source value.
        if (!dominators.dominates(definingBlock, parentBlock) ||
            (definingBlock == parentBlock && value.isa<BlockArgument>())) {
          toProcess.emplace_back(value, parentBlock);
          valuesToFree.insert(value);
        } else if (visitedValues.insert(std::make_tuple(value, definingBlock))
                       .second)
          toProcess.emplace_back(value, definingBlock);
      }
    };

    // Detect possibly unsafe aliases starting from all allocations.
    for (BufferPlacementAllocs::AllocEntry &entry : allocs) {
      Value allocValue = std::get<0>(entry);
      findUnsafeValues(allocValue, allocValue.getDefiningOp()->getBlock());
    }
    // Try to find block arguments that require an explicit free operation
    // until we reach a fixed point.
    while (!toProcess.empty()) {
      auto current = toProcess.pop_back_val();
      findUnsafeValues(std::get<0>(current), std::get<1>(current));
    }

    // Update buffer aliases to ensure that we free all buffers and block
    // arguments at the correct locations.
    aliases.remove(valuesToFree);

    // Add new allocs and additional copy operations.
    for (Value value : valuesToFree) {
      if (auto blockArg = value.dyn_cast<BlockArgument>())
        introduceBlockArgCopy(blockArg);
      else
        introduceValueCopyForRegionResult(value);

      // Register the value to require a final dealloc. Note that we do not
      // have to assign a block here since we do not want to move the
      // allocation node to another location.
      allocs.registerAlloc(std::make_tuple(value, nullptr));
    }
  }

  /// Introduces temporary allocs in all predecessors and copies the source
  /// values into the newly allocated buffers.
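  ///
  /// For a predecessor terminator such as `br ^bb1(%buf : memref<...>)`, the
  /// edge is conceptually rewritten to the following (a sketch; %tmp is an
  /// illustrative name for the buffer created by introduceBufferCopy):
  ///
  ///   %tmp = alloc(...)
  ///   linalg.copy(%buf, %tmp)
  ///   br ^bb1(%tmp : memref<...>)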
  void introduceBlockArgCopy(BlockArgument blockArg) {
    // Allocate a buffer for the current block argument in the block of
    // the associated value (which will be a predecessor block by
    // definition).
    Block *block = blockArg.getOwner();
    for (auto it = block->pred_begin(), e = block->pred_end(); it != e; ++it) {
      // Get the terminator and the value that will be passed to our
      // argument.
      Operation *terminator = (*it)->getTerminator();
      auto branchInterface = cast<BranchOpInterface>(terminator);
      // Query the associated source value.
      Value sourceValue =
          branchInterface.getSuccessorOperands(it.getSuccessorIndex())
              .getValue()[blockArg.getArgNumber()];
      // Create a new alloc and copy at the current location of the terminator.
      Value alloc = introduceBufferCopy(sourceValue, terminator);
      // Wire new alloc and successor operand.
      auto mutableOperands =
          branchInterface.getMutableSuccessorOperands(it.getSuccessorIndex());
      if (!mutableOperands.hasValue())
        terminator->emitError() << "terminators with immutable successor "
                                   "operands are not supported";
      else
        mutableOperands.getValue()
            .slice(blockArg.getArgNumber(), 1)
            .assign(alloc);
    }

    // Check whether the block argument has implicitly defined predecessors via
    // the RegionBranchOpInterface. This can be the case if the current block
    // argument belongs to the first block in a region and the parent operation
    // implements the RegionBranchOpInterface.
    Region *argRegion = block->getParent();
    if (!argRegion || &argRegion->front() != block)
      return;
    Operation *parentOp = argRegion->getParentOp();
    auto regionInterface = dyn_cast<RegionBranchOpInterface>(parentOp);
    if (!regionInterface)
      return;

    introduceCopiesForRegionSuccessors(
        regionInterface, parentOp->getRegions(), blockArg,
        [&](RegionSuccessor &successorRegion) {
          // Find a predecessor of our argRegion.
          return successorRegion.getSuccessor() == argRegion;
        });

    // Check whether the block argument belongs to an entry region of the
    // parent operation. In this case, we have to introduce an additional copy
    // for the buffer that is passed to the argument.
    SmallVector<RegionSuccessor, 2> successorRegions;
    regionInterface.getSuccessorRegions(/*index=*/llvm::None, successorRegions);
    auto *it =
        llvm::find_if(successorRegions, [&](RegionSuccessor &successorRegion) {
          return successorRegion.getSuccessor() == argRegion;
        });
    if (it == successorRegions.end())
      return;

    // Determine the actual operand to introduce a copy for and rewire the
    // operand to point to the copy instead.
    Value operand =
        regionInterface.getSuccessorEntryOperands(argRegion->getRegionNumber())
            [llvm::find(it->getSuccessorInputs(), blockArg).getIndex()];
    Value copy = introduceBufferCopy(operand, parentOp);

    auto op = llvm::find(parentOp->getOperands(), operand);
    assert(op != parentOp->getOperands().end() &&
           "parentOp does not contain operand");
    parentOp->setOperand(op.getIndex(), copy);
  }

  /// Introduces temporary allocs in front of all associated nested-region
  /// terminators and copies the source values into the newly allocated
  /// buffers.
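  ///
  /// For instance, assuming a parent op that implements the
  /// RegionBranchOpInterface (such as scf.if) and yields a buffer, a region
  /// terminator `scf.yield %buf` is conceptually rewritten to (a sketch):
  ///
  ///   %tmp = alloc(...)
  ///   linalg.copy(%buf, %tmp)
  ///   scf.yield %tmp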
  void introduceValueCopyForRegionResult(Value value) {
    // Get the actual result index in the scope of the parent terminator.
    Operation *operation = value.getDefiningOp();
    auto regionInterface = cast<RegionBranchOpInterface>(operation);
    // Filter successors that return to the parent operation.
    auto regionPredicate = [&](RegionSuccessor &successorRegion) {
      // If the RegionSuccessor has no associated successor, it will return to
      // its parent operation.
      return !successorRegion.getSuccessor();
    };
    // Introduce a copy for all region "results" that are returned to the
    // parent operation. This is required since the parent's result value has
    // been considered critical. Therefore, the algorithm assumes that a copy
    // of a previously allocated buffer is returned by the operation (like in
    // the case of a block argument).
    introduceCopiesForRegionSuccessors(regionInterface, operation->getRegions(),
                                       value, regionPredicate);
  }

  /// Introduces buffer copies for all terminators in the given regions. The
  /// regionPredicate is applied to every successor region in order to restrict
  /// the copies to specific regions.
  template <typename TPredicate>
  void introduceCopiesForRegionSuccessors(
      RegionBranchOpInterface regionInterface, MutableArrayRef<Region> regions,
      Value argValue, const TPredicate &regionPredicate) {
    for (Region &region : regions) {
      // Query the regionInterface to get all successor regions of the current
      // one.
      SmallVector<RegionSuccessor, 2> successorRegions;
      regionInterface.getSuccessorRegions(region.getRegionNumber(),
                                          successorRegions);
      // Try to find a matching region successor.
      RegionSuccessor *regionSuccessor =
          llvm::find_if(successorRegions, regionPredicate);
      if (regionSuccessor == successorRegions.end())
        continue;
      // Get the operand index in the context of the current successor input
      // bindings.
      size_t operandIndex =
          llvm::find(regionSuccessor->getSuccessorInputs(), argValue)
              .getIndex();

      // Iterate over all immediate terminator operations to introduce
      // new buffer allocations. Thereby, the appropriate terminator operand
      // will be adjusted to point to the newly allocated buffer instead.
      walkReturnOperations(&region, [&](Operation *terminator) {
        // Extract the source value from the current terminator.
        Value sourceValue = terminator->getOperand(operandIndex);
        // Create a new alloc at the current location of the terminator.
        Value alloc = introduceBufferCopy(sourceValue, terminator);
        // Wire alloc and terminator operand.
        terminator->setOperand(operandIndex, alloc);
      });
    }
  }

  /// Creates a new memory allocation for the given source value and copies
  /// its content into the newly allocated buffer. The terminator operation is
  /// used to insert the alloc and copy operations at the right places.
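  ///
  /// For a dynamically shaped `%source : memref<?xf32>`, the emitted IR looks
  /// roughly like this (a sketch; SSA names are illustrative):
  ///
  ///   %c0 = constant 0 : index
  ///   %d0 = dim %source, %c0 : memref<?xf32>
  ///   %copy = alloc(%d0) : memref<?xf32>
  ///   linalg.copy(%source, %copy)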
  Value introduceBufferCopy(Value sourceValue, Operation *terminator) {
    // Avoid multiple copies of the same source value. This can happen in the
    // presence of loops when a branch acts as a backedge while also having
    // another successor that returns to its parent operation. Note that
    // copying copied buffers can introduce memory leaks since the invariant of
    // BufferPlacement assumes that a buffer will be only copied once into a
    // temporary buffer. Hence, the construction of copy chains introduces
    // additional allocations that are not tracked automatically by the
    // algorithm.
    if (copiedValues.contains(sourceValue))
      return sourceValue;
    // Create a new alloc at the current location of the terminator.
    auto memRefType = sourceValue.getType().cast<MemRefType>();
    OpBuilder builder(terminator);

    // Extract information about dynamically shaped types by
    // extracting their dynamic dimensions.
    SmallVector<Value, 4> dynamicOperands;
    for (auto shapeElement : llvm::enumerate(memRefType.getShape())) {
      if (!ShapedType::isDynamic(shapeElement.value()))
        continue;
      dynamicOperands.push_back(builder.create<DimOp>(
          terminator->getLoc(), sourceValue, shapeElement.index()));
    }

    // TODO: provide a generic interface to create dialect-specific
    // Alloc and CopyOp nodes.
    auto alloc = builder.create<AllocOp>(terminator->getLoc(), memRefType,
                                         dynamicOperands);

    // Create a new copy operation that copies the contents of the old
    // allocation to the new one.
    builder.create<linalg::CopyOp>(terminator->getLoc(), sourceValue, alloc);

    // Remember the copy of the original source value.
    copiedValues.insert(alloc);
    return alloc;
  }

  /// Finds correct dealloc positions according to the algorithm described at
  /// the top of the file for all alloc nodes and block arguments that can be
  /// handled by this analysis.
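  ///
  /// For example, if the last use of an alloc (or any of its aliases) within
  /// the placement block is `use(%0)`, the dealloc is inserted directly
  /// behind that use (a sketch):
  ///
  ///   %0 = alloc() : memref<2xf32>
  ///   use(%0)
  ///   dealloc %0 : memref<2xf32>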
  void placeDeallocs() const {
    // Move or insert deallocs using the previously computed information.
    // These deallocations will be linked to their associated allocation nodes
    // since they don't have any aliases that can (potentially) increase their
    // liveness.
    for (const BufferPlacementAllocs::AllocEntry &entry : allocs) {
      Value alloc = std::get<0>(entry);
      auto aliasesSet = aliases.resolve(alloc);
      assert(aliasesSet.size() > 0 && "must contain at least one alias");

      // Determine the actual block to place the dealloc and get liveness
      // information.
      Block *placementBlock =
          findCommonDominator(alloc, aliasesSet, postDominators);
      const LivenessBlockInfo *livenessInfo =
          liveness.getLiveness(placementBlock);

      // We have to ensure that the dealloc will be after the last use of all
      // aliases of the given value. We first assume that there are no uses in
      // the placementBlock and that we can safely place the dealloc at the
      // beginning.
      Operation *endOperation = &placementBlock->front();

      // Iterate over all aliases and ensure that the endOperation will point
      // to the last operation of all potential aliases in the placementBlock.
      for (Value alias : aliasesSet) {
        // Ensure that the start operation is at least the defining operation
        // of the current alias to avoid invalid placement of deallocs for
        // aliases without any uses.
        Operation *beforeOp = endOperation;
        if (alias.getDefiningOp() &&
            !(beforeOp = placementBlock->findAncestorOpInBlock(
                  *alias.getDefiningOp())))
          continue;

        Operation *aliasEndOperation =
            livenessInfo->getEndOperation(alias, beforeOp);
        // Check whether the aliasEndOperation lies in the desired block and
        // whether it is behind the current endOperation. If so, it becomes
        // the new endOperation.
        if (aliasEndOperation->getBlock() == placementBlock &&
            endOperation->isBeforeInBlock(aliasEndOperation))
          endOperation = aliasEndOperation;
      }
      // endOperation is the last operation behind which we can safely insert
      // the dealloc taking all potential aliases into account.

      // If there is an existing dealloc, move it to the right place.
      Operation *deallocOperation = std::get<1>(entry);
      if (deallocOperation) {
        deallocOperation->moveAfter(endOperation);
      } else {
        // If the dealloc position would be at the terminator operation of the
        // block, the value escapes the block and must not be deallocated
        // here.
        Operation *nextOp = endOperation->getNextNode();
        if (!nextOp)
          continue;
        // If there is no dealloc node, insert one in the right place.
        OpBuilder builder(nextOp);
        builder.create<DeallocOp>(alloc.getLoc(), alloc);
      }
    }
  }

  /// The dominator info used to check the dominance relation between values
  /// and their aliases.
  DominanceInfo dominators;

  /// The post dominator info used to compute the common post-dominator block
  /// in which the deallocs are placed.
  PostDominanceInfo postDominators;

  /// Stores already copied allocations to avoid additional copies of copies.
  ValueSetT copiedValues;
};

//===----------------------------------------------------------------------===//
// BufferDeallocationPass
//===----------------------------------------------------------------------===//

/// The actual buffer deallocation pass that inserts and moves dealloc nodes
/// into the right positions. Furthermore, it inserts additional allocs and
/// copies if necessary. It uses the algorithm described at the top of the
/// file.
struct BufferDeallocationPass : BufferDeallocationBase<BufferDeallocationPass> {

  void runOnFunction() override {
    // Ensure that there are supported loops only.
    Backedges backedges(getFunction());
    if (backedges.size()) {
      getFunction().emitError(
          "only structured control-flow loops are supported");
      return;
    }

    // Place all required temporary alloc, copy and dealloc nodes.
    BufferDeallocation deallocation(getFunction());
    deallocation.deallocate();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// BufferDeallocationPass construction
//===----------------------------------------------------------------------===//

std::unique_ptr<Pass> mlir::createBufferDeallocationPass() {
  return std::make_unique<BufferDeallocationPass>();
}