//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements utilities that allow one to create IR moving the data // across different levels of the GPU memory hierarchy. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/GPU/MemoryPromotion.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/SCF/EDSC/Builders.h" #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" using namespace mlir; using namespace mlir::edsc; using namespace mlir::edsc::intrinsics; using namespace mlir::gpu; /// Returns the textual name of a GPU dimension. static StringRef getDimName(unsigned dim) { if (dim == 0) return "x"; if (dim == 1) return "y"; if (dim == 2) return "z"; llvm_unreachable("dimension ID overflow"); } /// Emits the (imperfect) loop nest performing the copy between "from" and "to" /// values using the bounds derived from the "from" value. Emits at least /// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with /// single-iteration loops. Maps the innermost loops to thread dimensions, in /// reverse order to enable access coalescing in the innermost loop. static void insertCopyLoops(OpBuilder &builder, Location loc, MemRefBoundsCapture &bounds, Value from, Value to) { // Create EDSC handles for bounds. unsigned rank = bounds.rank(); SmallVector lbs, ubs, steps; // Make sure we have enough loops to use all thread dimensions, these trivial // loops should be outermost and therefore inserted first. if (rank < GPUDialect::getNumWorkgroupDimensions()) { unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank; Value zero = std_constant_index(0); Value one = std_constant_index(1); lbs.resize(extraLoops, zero); ubs.resize(extraLoops, one); steps.resize(extraLoops, one); } // Add existing bounds. lbs.append(bounds.getLbs().begin(), bounds.getLbs().end()); ubs.append(bounds.getUbs().begin(), bounds.getUbs().end()); // Emit constant operations for steps. steps.reserve(lbs.size()); llvm::transform(bounds.getSteps(), std::back_inserter(steps), [](int64_t step) { return std_constant_index(step); }); // Obtain thread identifiers and block sizes, necessary to map to them. auto indexType = builder.getIndexType(); SmallVector threadIds, blockDims; for (unsigned i = 0; i < 3; ++i) { auto dimName = builder.getStringAttr(getDimName(i)); threadIds.push_back( builder.create(loc, indexType, dimName)); blockDims.push_back( builder.create(loc, indexType, dimName)); } // Produce the loop nest with copies. SmallVector ivs(lbs.size()); loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) { ivs.assign(loopIvs.begin(), loopIvs.end()); auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank); StdIndexedValue fromHandle(from), toHandle(to); toHandle(activeIvs) = fromHandle(activeIvs); }); // Map the innermost loops to threads in reverse order. for (auto en : llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back( GPUDialect::getNumWorkgroupDimensions())))) { Value v = en.value(); auto loop = cast(v.getParentRegion()->getParentOp()); mapLoopToProcessorIds(loop, {threadIds[en.index()]}, {blockDims[en.index()]}); } } /// Emits the loop nests performing the copy to the designated location in the /// beginning of the region, and from the designated location immediately before /// the terminator of the first block of the region. The region is expected to /// have one block. This boils down to the following structure /// /// ^bb(...): /// /// for %arg0 = ... to ... step ... { /// ... /// for %argN = to ... step { /// %0 = load %from[%arg0, ..., %argN] /// store %0, %to[%arg0, ..., %argN] /// } /// ... /// } /// gpu.barrier /// <... original body ...> /// gpu.barrier /// for %arg0 = ... to ... step ... { /// ... /// for %argN = to ... step { /// %1 = load %to[%arg0, ..., %argN] /// store %1, %from[%arg0, ..., %argN] /// } /// ... /// } /// /// Inserts the barriers unconditionally since different threads may be copying /// values and reading them. An analysis would be required to eliminate barriers /// in case where value is only used by the thread that copies it. Both copies /// are inserted unconditionally, an analysis would be required to only copy /// live-in and live-out values when necessary. This copies the entire memref /// pointed to by "from". In case a smaller block would be sufficient, the /// caller can create a subview of the memref and promote it instead. static void insertCopies(Region ®ion, Location loc, Value from, Value to) { auto fromType = from.getType().cast(); auto toType = to.getType().cast(); (void)fromType; (void)toType; assert(fromType.getShape() == toType.getShape()); assert(fromType.getRank() != 0); assert(llvm::hasSingleElement(region) && "unstructured control flow not supported"); OpBuilder builder(region.getContext()); builder.setInsertionPointToStart(®ion.front()); ScopedContext edscContext(builder, loc); MemRefBoundsCapture fromBoundsCapture(from); insertCopyLoops(builder, loc, fromBoundsCapture, from, to); builder.create(loc); builder.setInsertionPoint(®ion.front().back()); builder.create(loc); insertCopyLoops(builder, loc, fromBoundsCapture, to, from); } /// Promotes a function argument to workgroup memory in the given function. The /// copies will be inserted in the beginning and in the end of the function. void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) { Value value = op.getArgument(arg); auto type = value.getType().dyn_cast(); assert(type && type.hasStaticShape() && "can only promote memrefs"); // Get the type of the buffer in the workgroup memory. int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace(); auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {}, workgroupMemoryAddressSpace); Value attribution = op.addWorkgroupAttribution(bufferType); // Replace the uses first since only the original uses are currently present. // Then insert the copies. value.replaceAllUsesWith(attribution); insertCopies(op.getBody(), op.getLoc(), value, attribution); }