//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses, and
// reduces the use of hardware prefetch streams as well as false sharing. It is
// also necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
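//
// As a rough illustration (not the exact IR this pass emits; the generated
// code depends on the pass options and on the memref regions actually
// accessed), a point-wise copy of a 256xf32 memref '%A' from the default
// (slow) memory space into a buffer in fast memory space 2 would look like:
//
//     %fast = alloc() : memref<256xf32, 2>
//     affine.for %i = 0 to 256 {
//       %v = affine.load %A[%i] : memref<256xf32>
//       affine.store %v, %fast[%i] : memref<256xf32, 2>
//     }
//     ... loads of %A in the original nest now load from %fast ...
//     dealloc %fast : memref<256xf32, 2>
//
// When the 'generateDma' option is set, DMA operations are generated instead
// of point-wise copy loops.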
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <limits>

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses the nesting structure, recursing to inner
/// levels if necessary to determine at what depth copies need to be placed so
/// that the allocated buffers fit within the memory capacity provided.
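/// For instance (purely illustrative numbers, not taken from this file), with
/// a fast memory capacity of 32 KiB, an outer 'affine.for' whose body touches
/// a 1024x1024xf32 region (4 MiB) cannot be buffered at that level, so the
/// pass recurses into its body; if each inner iteration only touches a
/// 64x64xf32 tile (16 KiB), the copies are placed around that inner loop
/// instead.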
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public AffineDataCopyGenerationBase<AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
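    // The 'fastMemoryCapacity' pass option is expressed in KiB; convert the
    // byte count accordingly.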
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnFunction() override;
  LogicalResult runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // end anonymous namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly
/// created buffers in 'fastMemorySpace', and replaces memory operations to the
/// former by the latter. Only load op's are handled for now.
/// TODO: extend this to store op's.
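///
/// A hypothetical way to schedule this pass from C++ (the pass manager
/// variable and all option values below are illustrative, not taken from this
/// file):
///
///   pm.addPass(mlir::createAffineDataCopyGenerationPass(
///       /*slowMemorySpace=*/0, /*fastMemorySpace=*/2, /*tagMemorySpace=*/3,
///       /*minDmaTransferSize=*/1024,
///       /*fastMemCapacityBytes=*/32 * 1024));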
std::unique_ptr<OperationPass<FuncOp>> mlir::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<FuncOp>>
mlir::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for op's nested within).
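///
/// For instance (illustrative), a block of the form:
///
///   affine.load ...; affine.store ...; affine.for ... { ... }; affine.load ...
///
/// is split into three ranges: the leading load/store sequence, the
/// 'affine.for' by itself, and the trailing load.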
LogicalResult
AffineDataCopyGeneration::runOnBlock(Block *block,
                                     DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return success();

  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
  // TODO: An 'affine.if' operation is currently treated like any other
  // operation; 'affine.if''s could have 'affine.for's in them and should be
  // treated separately.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'for' loop, we split the range there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                             /*filterMemRef=*/llvm::None, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        Optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.hasValue() &&
                static_cast<uint64_t>(footprint.getValue()) >
                    fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which the footprint fits in fast memory
      // capacity. If the footprint can't be calculated, we assume for now it
      // fits. Recurse inside if the footprint for 'forOp' exceeds capacity, or
      // when skipNonUnitStrideLoops is set and the step size is not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - we thus don't update
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
                               /*filterMemRef=*/llvm::None, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->isKnownTerminator() && "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
                           copyOptions, /*filterMemRef=*/llvm::None, copyNests);
  }

  return success();
}

void AffineDataCopyGeneration::runOnFunction() {
  FuncOp f = getFunction();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained load's/store's, and the latter could anyway also be
  // canonicalized.
  OwningRewritePatternList patterns;
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternList frozenPatterns(std::move(patterns));
  for (Operation *op : copyOps)
    applyOpPatternsAndFold(op, frozenPatterns);
}