//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/MemoryPromotion.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using namespace mlir::gpu;

/// Returns the textual name of a GPU dimension.
static StringRef getDimName(unsigned dim) {
  if (dim == 0)
    return "x";
  if (dim == 1)
    return "y";
  if (dim == 2)
    return "z";

  llvm_unreachable("dimension ID overflow");
}

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
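/// For example, in a rank-3 copy the loop over the last memref dimension is
/// mapped to thread dimension "x", so that with the default layout consecutive
/// threads access contiguous elements.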
static void insertCopyLoops(OpBuilder &builder, Location loc,
                            MemRefBoundsCapture &bounds, Value from, Value to) {
  // Create EDSC handles for bounds.
  unsigned rank = bounds.rank();
  SmallVector<Value, 4> lbs, ubs, steps;

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and are therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    Value zero = std_constant_index(0);
    Value one = std_constant_index(1);
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
  ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());

  // Emit constant operations for steps.
  steps.reserve(lbs.size());
  llvm::transform(bounds.getSteps(), std::back_inserter(steps),
                  [](int64_t step) { return std_constant_index(step); });

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = builder.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (unsigned i = 0; i < 3; ++i) {
    auto dimName = builder.getStringAttr(getDimName(i));
    threadIds.push_back(
        builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
    blockDims.push_back(
        builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    ivs.assign(loopIvs.begin(), loopIvs.end());
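    // Only the trailing "rank" induction variables index the memrefs; any
    // leading ones belong to the single-iteration padding loops.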
    auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
    StdIndexedValue fromHandle(from), toHandle(to);
    toHandle(activeIvs) = fromHandle(activeIvs);
  });

  // Map the innermost loops to threads in reverse order.
  for (auto en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location at the
/// beginning of the region, and back from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %0 = load %from[%arg0, ..., %argN]
///         store %0, %to[%arg0, ..., %argN]
///       }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %1 = load %to[%arg0, ..., %argN]
///         store %1, %from[%arg0, ..., %argN]
///       }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate barriers
/// in cases where a value is only used by the thread that copies it. Both
/// copies are inserted unconditionally; an analysis would be required to copy
/// only live-in and live-out values when necessary. This copies the entire
/// memref pointed to by "from". If a smaller block would be sufficient, the
/// caller can create a subview of the memref and promote that instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  OpBuilder builder(region.getContext());
  builder.setInsertionPointToStart(&region.front());

  ScopedContext edscContext(builder, loc);
  MemRefBoundsCapture fromBoundsCapture(from);
  insertCopyLoops(builder, loc, fromBoundsCapture, from, to);
  builder.create<gpu::BarrierOp>(loc);

  builder.setInsertionPoint(&region.front().back());
  builder.create<gpu::BarrierOp>(loc);
  insertCopyLoops(builder, loc, fromBoundsCapture, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and the end of the function.
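///
/// For illustration (IR abridged, names illustrative), promoting argument
/// %arg0 of
///
///   gpu.func @kernel(%arg0: memref<4x4xf32>) kernel { ... }
///
/// adds a workgroup attribution in the workgroup address space, redirects all
/// uses of %arg0 to it, and wraps the original body in copy loops and
/// barriers:
///
///   gpu.func @kernel(%arg0: memref<4x4xf32>)
///       workgroup(%promoted: memref<4x4xf32, 3>) kernel {
///     <copy %arg0 to %promoted>
///     gpu.barrier
///     <... original body, now using %promoted ...>
///     gpu.barrier
///     <copy %promoted back to %arg0>
///   }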
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {},
                                    workgroupMemoryAddressSpace);

  Value attribution = op.addWorkgroupAttribution(bufferType);

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}