//===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements target-dependent lowering of vector transfer operations.
//
//===----------------------------------------------------------------------===//

#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

#include "../PassDetail.h"
#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using vector::TransferReadOp;
using vector::TransferWriteOp;

// Return a list of Values that correspond to multiple AffineApplyOp, one for
// each result of `map`. Each `expr` in `map` is canonicalized and folded
// greedily according to its operands.
// TODO: factor out in a common location that both linalg and vector can use.
static SmallVector<Value, 4> applyMapToValues(OpBuilder &b, Location loc,
                                              AffineMap map,
                                              ValueRange values) {
  SmallVector<Value, 4> res;
  res.reserve(map.getNumResults());
  unsigned numDims = map.getNumDims(), numSym = map.getNumSymbols();
  // For each `expr` in `map`, applies the `expr` to the values extracted from
  // ranges. If the resulting application can be folded into a Value, the
  // folding occurs eagerly. Otherwise, an affine.apply operation is emitted.
  for (auto expr : map.getResults()) {
    AffineMap map = AffineMap::get(numDims, numSym, expr);
    SmallVector<Value, 4> operands(values.begin(), values.end());
    fullyComposeAffineMapAndOperands(&map, &operands);
    canonicalizeMapAndOperands(&map, &operands);
    res.push_back(b.createOrFold<AffineApplyOp>(loc, map, operands));
  }
  return res;
}

namespace {
/// Helper class that captures the common information needed to lower N>1-D
/// vector transfer operations (read and write).
/// On construction, this class opens an edsc::ScopedContext for simpler IR
/// manipulation.
/// In pseudo-IR, for an n-D vector_transfer_read such as:
///
/// ```
///   vector_transfer_read(%m, %offsets, identity_map, %fill) :
///     memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///     vector<(major_dims) x (minor_dims) x type>
/// ```
///
/// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM or
/// higher).
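///
/// As a purely illustrative example (the shapes, SSA names, and map below are
/// hypothetical, not taken from any existing test), a minor-identity read of a
/// 2-D vector from a 3-D memref
///
/// ```
///   %v = vector.transfer_read %A[%i, %j, %k], %f0
///     {permutation_map = affine_map<(d0, d1, d2) -> (d1, d2)>}
///     : memref<?x?x?xf32>, vector<3x15xf32>
/// ```
///
/// (where %f0 is the padding scalar) has one leading dimension (indexed by
/// %i), one major dimension of size 3 and one minor dimension of size 15.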
///
/// This is the entry point to emitting pseudo-IR resembling:
///
/// ```
///   %tmp = alloc(): memref<(major_dims) x vector>
///   for (%ivs_major, {0}, {vector_shape}, {1}) {   // (N-1)-D loop nest
///     if (all_of(%ivs_major + %offsets, <, major_dims)) {
///       %v = vector_transfer_read(
///         {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor},
///          %ivs_minor):
///         memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///         vector<(minor_dims) x type>;
///       store(%v, %tmp, %ivs_major);
///     } else {
///       %v = splat(vector<(minor_dims) x type>, %fill)
///       store(%v, %tmp, %ivs_major);
///     }
///   }
///   %res = load(%tmp, %0): memref<(major_dims) x vector>):
///     vector<(major_dims) x (minor_dims) x type>
/// ```
///
template <typename ConcreteOp>
class NDTransferOpHelper {
public:
  NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp,
                     const VectorTransferToSCFOptions &options)
      : rewriter(rewriter), options(options), loc(xferOp.getLoc()),
        scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp),
        op(xferOp.getOperation()) {
    vectorType = xferOp.getVectorType();
    // TODO: when we go to k > 1-D vectors adapt minorRank.
    minorRank = 1;
    majorRank = vectorType.getRank() - minorRank;
    leadingRank = xferOp.getLeadingMemRefRank();
    majorVectorType =
        VectorType::get(vectorType.getShape().take_front(majorRank),
                        vectorType.getElementType());
    minorVectorType =
        VectorType::get(vectorType.getShape().take_back(minorRank),
                        vectorType.getElementType());
    /// Memref of minor vector type is used for individual transfers.
    memRefMinorVectorType =
        MemRefType::get(majorVectorType.getShape(), minorVectorType, {},
                        xferOp.getMemRefType().getMemorySpace());
  }

  LogicalResult doReplace();

private:
  /// Creates the loop nest on the "major" dimensions and calls the
  /// `loopBodyBuilder` lambda in the context of the loop nest.
  void
  emitLoops(llvm::function_ref<void(ValueRange, ValueRange, ValueRange,
                                    ValueRange, const MemRefBoundsCapture &)>
                loopBodyBuilder);

  /// Common state to lower vector transfer ops.
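  //
  // Illustrative example (hypothetical shapes, assuming minorRank == 1): for a
  // transfer between memref<?x?x3x15xf32> and vector<3x15xf32> we get
  // leadingRank = 2, majorRank = 1, minorRank = 1, and the derived types are
  // majorVectorType = vector<3xf32>, minorVectorType = vector<15xf32>,
  // memRefMinorVectorType = memref<3xvector<15xf32>>.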
  PatternRewriter &rewriter;
  const VectorTransferToSCFOptions &options;
  Location loc;
  std::unique_ptr<ScopedContext> scope;
  ConcreteOp xferOp;
  Operation *op;
  // A vector transfer copies data between:
  //   - memref<(leading_dims) x (major_dims) x (minor_dims) x type>
  //   - vector<(major_dims) x (minor_dims) x type>
  unsigned minorRank;         // for now always 1
  unsigned majorRank;         // vector rank - minorRank
  unsigned leadingRank;       // memref rank - vector rank
  VectorType vectorType;      // vector<(major_dims) x (minor_dims) x type>
  VectorType majorVectorType; // vector<(major_dims) x type>
  VectorType minorVectorType; // vector<(minor_dims) x type>
  MemRefType memRefMinorVectorType; // memref<vector<(minor_dims) x type>>
};

template <typename ConcreteOp>
void NDTransferOpHelper<ConcreteOp>::emitLoops(
    llvm::function_ref<void(ValueRange, ValueRange, ValueRange, ValueRange,
                            const MemRefBoundsCapture &)>
        loopBodyBuilder) {
  /// Loop nest operates on the major dimensions.
  MemRefBoundsCapture memrefBoundsCapture(xferOp.memref());

  if (options.unroll) {
    auto shape = majorVectorType.getShape();
    auto strides = computeStrides(shape);
    unsigned numUnrolledInstances = computeMaxLinearIndex(shape);
    ValueRange indices(xferOp.indices());
    for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) {
      SmallVector<int64_t, 4> offsets = delinearize(strides, idx);
      SmallVector<Value, 4> offsetValues = llvm::to_vector<4>(
          llvm::map_range(offsets, [](int64_t off) -> Value {
            return std_constant_index(off);
          }));
      loopBodyBuilder(offsetValues, indices.take_front(leadingRank),
                      indices.drop_front(leadingRank).take_front(majorRank),
                      indices.take_back(minorRank), memrefBoundsCapture);
    }
  } else {
    VectorBoundsCapture vectorBoundsCapture(majorVectorType);
    auto majorLbs = vectorBoundsCapture.getLbs();
    auto majorUbs = vectorBoundsCapture.getUbs();
    auto majorSteps = vectorBoundsCapture.getSteps();
    affineLoopNestBuilder(
        majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) {
          ValueRange indices(xferOp.indices());
          loopBodyBuilder(majorIvs, indices.take_front(leadingRank),
                          indices.drop_front(leadingRank).take_front(majorRank),
                          indices.take_back(minorRank), memrefBoundsCapture);
        });
  }
}

static Optional<int64_t> extractConstantIndex(Value v) {
  if (auto cstOp = v.getDefiningOp<ConstantIndexOp>())
    return cstOp.getValue();
  if (auto affineApplyOp = v.getDefiningOp<AffineApplyOp>())
    if (affineApplyOp.getAffineMap().isSingleConstant())
      return affineApplyOp.getAffineMap().getSingleConstantResult();
  return None;
}

// Missing foldings of scf.if make it necessary to perform poor man's folding
// eagerly, especially in the case of unrolling. In the future, this should go
// away once scf.if folds properly.
static Value onTheFlyFoldSLT(Value v, Value ub) {
  using namespace mlir::edsc::op;
  auto maybeCstV = extractConstantIndex(v);
  auto maybeCstUb = extractConstantIndex(ub);
  if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb)
    return Value();
  return slt(v, ub);
}

///   1. Compute the indexings `majorIvs + majorOffsets` and save them in
///      `majorIvsPlusOffsets`.
///   2. Return a value of i1 that determines whether the first
///      `majorIvs.rank()` dimensions `majorIvs + majorOffsets` are all within
///      `memrefBounds`.
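///
/// For instance (hypothetical values), with majorIvs = (%i0, %i1),
/// majorOffsets = (%o0, %o1), permuted upper bounds (%ub0, %ub1), and both
/// major dimensions masked, this produces the condition
///   (%i0 + %o0 < %ub0) && (%i1 + %o1 < %ub1)
/// and fills `majorIvsPlusOffsets` with (%i0 + %o0, %i1 + %o1). Comparisons
/// that statically fold to true are omitted from the conjunction; if all of
/// them fold, a null Value is returned.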
static Value emitInBoundsCondition(PatternRewriter &rewriter,
                                   VectorTransferOpInterface xferOp,
                                   unsigned leadingRank, ValueRange majorIvs,
                                   ValueRange majorOffsets,
                                   const MemRefBoundsCapture &memrefBounds,
                                   SmallVectorImpl<Value> &majorIvsPlusOffsets) {
  Value inBoundsCondition;
  majorIvsPlusOffsets.reserve(majorIvs.size());
  unsigned idx = 0;
  SmallVector<Value, 4> bounds =
      applyMapToValues(rewriter, xferOp.getLoc(), xferOp.permutation_map(),
                       memrefBounds.getUbs());
  for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) {
    Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it);
    using namespace mlir::edsc::op;
    majorIvsPlusOffsets.push_back(iv + off);
    if (xferOp.isMaskedDim(leadingRank + idx)) {
      Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub);
      if (inBoundsCond)
        inBoundsCondition = (inBoundsCondition)
                                ? (inBoundsCondition && inBoundsCond)
                                : inBoundsCond;
    }
    ++idx;
  }
  return inBoundsCondition;
}

// TODO: Parallelism and threadlocal considerations.
static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType,
                                     Operation *op) {
  auto &b = ScopedContext::getBuilderRef();
  OpBuilder::InsertionGuard guard(b);
  Operation *scope =
      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  b.setInsertionPointToStart(&scope->getRegion(0).front());
  Value res = std_alloca(memRefMinorVectorType);
  return res;
}

template <>
LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
  Value alloc, result;
  if (options.unroll)
    result = std_splat(vectorType, xferOp.padding());
  else
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                const MemRefBoundsCapture &memrefBounds) {
    /// Lambda to load 1-D vector in the current loop ivs + offset context.
    auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value memref = xferOp.memref();
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      return vector_transfer_read(minorVectorType, memref, indexing,
                                  AffineMapAttr::get(map), xferOp.padding(),
                                  masked);
    };

    // 1. Compute the inBoundsCondition in the current loops ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
        leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2. If the condition is not null, we need an IfOp, which may yield
      // if `options.unroll` is true.
      SmallVector<Type, 1> resultType;
      if (options.unroll)
        resultType.push_back(vectorType);

      // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise
      // splat a 1-D vector.
      ValueRange ifResults = conditionBuilder(
          resultType, inBoundsCondition,
          [&]() -> scf::ValueVector {
            Value vector = load1DVector(majorIvsPlusOffsets);
            // 3.a. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `else` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.b. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          },
          [&]() -> scf::ValueVector {
            Value vector = std_splat(minorVectorType, xferOp.padding());
            // 3.c. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `then` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.d. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          });

      if (!resultType.empty())
        result = *ifResults.begin();
    } else {
      // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read.
      Value loaded1D = load1DVector(majorIvsPlusOffsets);
      // 5.a. If `options.unroll` is true, insert the 1-D vector in the
      // aggregate.
      if (options.unroll)
        result = vector_insert(loaded1D, result, majorIvs);
      // 5.b. Otherwise, just go through the temporary `alloc`.
      else
        std_store(loaded1D, alloc, majorIvs);
    }
  });

  assert((!options.unroll ^ (bool)result) &&
         "Expected resulting Value iff unroll");
  if (!result)
    result =
        std_load(vector_type_cast(MemRefType::get({}, vectorType), alloc));
  rewriter.replaceOp(op, result);

  return success();
}

template <>
LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
  Value alloc;
  if (!options.unroll) {
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);
    std_store(xferOp.vector(),
              vector_type_cast(MemRefType::get({}, vectorType), alloc));
  }

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                const MemRefBoundsCapture &memrefBounds) {
    // Lower to 1-D vector_transfer_write and let recursion handle it.
    auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value result;
      // If `options.unroll` is true, extract the 1-D vector from the
      // aggregate.
      if (options.unroll)
        result = vector_extract(xferOp.vector(), majorIvs);
      else
        result = std_load(alloc, majorIvs);
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      vector_transfer_write(result, xferOp.memref(), indexing,
                            AffineMapAttr::get(map), masked);
    };

    // 1. Compute the inBoundsCondition in the current loops ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
        leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2.a. If the condition is not null, we need an IfOp, to write
      // conditionally. Progressively lower to a 1-D transfer write.
      conditionBuilder(inBoundsCondition,
                       [&] { emitTransferWrite(majorIvsPlusOffsets); });
    } else {
      // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer
      // write.
      emitTransferWrite(majorIvsPlusOffsets);
    }
  });

  rewriter.eraseOp(op);

  return success();
}

} // namespace

/// Analyzes the `transfer` to find an access dimension along the fastest
/// remote MemRef dimension. If such a dimension with coalescing properties is
/// found, `pivs` and `vectorBoundsCapture` are swapped so that the invocation
/// of LoopNestBuilder captures it in the innermost loop.
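///
/// Illustrative example (hypothetical map): for a rank-3 memref and a
/// permutation map (d0, d1, d2) -> (d2, d1), the fastest-varying memref
/// dimension d2 appears as result 0 of the map, so this returns 0. If no
/// result maps to the innermost memref dimension, -1 is returned and no
/// swapping takes place.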
template <typename TransferOpTy>
static int computeCoalescedIndex(TransferOpTy transfer) {
  // rank of the remote memory access, coalescing behavior occurs on the
  // innermost memory dimension.
  auto remoteRank = transfer.getMemRefType().getRank();
  // Iterate over the results expressions of the permutation map to determine
  // the loop order for creating pointwise copies between remote and local
  // memories.
  int coalescedIdx = -1;
  auto exprs = transfer.permutation_map().getResults();
  for (auto en : llvm::enumerate(exprs)) {
    auto dim = en.value().template dyn_cast<AffineDimExpr>();
    if (!dim) {
      continue;
    }
    auto memRefDim = dim.getPosition();
    if (memRefDim == remoteRank - 1) {
      // memRefDim has coalescing properties, it should be swapped in the last
      // position.
      assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices");
      coalescedIdx = en.index();
    }
  }
  return coalescedIdx;
}

template <typename TransferOpTy>
VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter(
    VectorTransferToSCFOptions options, MLIRContext *context)
    : RewritePattern(TransferOpTy::getOperationName(), 1, context),
      options(options) {}

/// Used for staging the transfer in a local buffer.
template <typename TransferOpTy>
MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType(
    TransferOpTy transfer) const {
  auto vectorType = transfer.getVectorType();
  return MemRefType::get(vectorType.getShape().drop_back(),
                         VectorType::get(vectorType.getShape().take_back(),
                                         vectorType.getElementType()),
                         {}, 0);
}

static void emitWithBoundsChecks(
    PatternRewriter &rewriter, VectorTransferOpInterface transfer,
    ValueRange ivs, const MemRefBoundsCapture &memRefBoundsCapture,
    function_ref<void(ArrayRef<Value>)> inBoundsFun,
    function_ref<void(ArrayRef<Value>)> outOfBoundsFun = nullptr) {
  // Permute the incoming indices according to the permutation map.
  SmallVector<Value, 4> indices =
      applyMapToValues(rewriter, transfer.getLoc(), transfer.permutation_map(),
                       transfer.indices());

  // Generate a bounds check if necessary.
  SmallVector<Value, 4> majorIvsPlusOffsets;
  Value inBoundsCondition =
      emitInBoundsCondition(rewriter, transfer, 0, ivs, indices,
                            memRefBoundsCapture, majorIvsPlusOffsets);

  // Apply the permutation map to the ivs. The permutation map may not use all
  // the inputs.
  SmallVector<Value, 4> scalarAccessExprs(transfer.indices().size());
  for (unsigned memRefDim = 0; memRefDim < transfer.indices().size();
       ++memRefDim) {
    // Linear search on a small number of entries.
    int loopIndex = -1;
    auto exprs = transfer.permutation_map().getResults();
    for (auto en : llvm::enumerate(exprs)) {
      auto expr = en.value();
      auto dim = expr.dyn_cast<AffineDimExpr>();
      // Sanity check.
      assert((dim || expr.cast<AffineConstantExpr>().getValue() == 0) &&
             "Expected dim or 0 in permutationMap");
      if (dim && memRefDim == dim.getPosition()) {
        loopIndex = en.index();
        break;
      }
    }

    using namespace edsc::op;
    auto i = transfer.indices()[memRefDim];
    scalarAccessExprs[memRefDim] = loopIndex < 0 ? i : i + ivs[loopIndex];
  }

  if (inBoundsCondition)
    conditionBuilder(
        /* scf.if */ inBoundsCondition, // {
        [&] { inBoundsFun(scalarAccessExprs); },
        // } else {
        outOfBoundsFun ? [&] { outOfBoundsFun(scalarAccessExprs); }
                       : function_ref<void()>()
        // }
    );
  else
    inBoundsFun(scalarAccessExprs);
}

namespace mlir {

/// Lowers TransferReadOp into a combination of:
///   1. local memory allocation;
///   2. perfect loop nest over:
///      a. scalar load from the original memref (or the padding value when
///         out-of-bounds);
///      b. scalar store to the local buffer (viewed as a scalar memref);
///   3. vector_load from the local buffer (viewed as a memref<1 x vector>);
///   4. local memory deallocation.
///
/// Lowers the data transfer part of a TransferReadOp while ensuring no
/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
/// padding.
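///
/// Schematically (an illustrative pseudo-IR sketch, not actual op syntax),
/// the conservative scalar path below emits:
///
/// ```
///   %tmp = alloca() : memref<(vector_shape minus last) x vector<last x type>>
///   scf.for %ivs over the vector shape {
///     %vec = load(%tmp, %ivs minus last)
///     if (permuted %ivs + indices are in bounds) {
///       %s = load(%memref, indices + permuted %ivs)
///       store(insertelement(%s, %vec, last %iv), %tmp, %ivs minus last)
///     } else {
///       store(insertelement(%padding, %vec, last %iv), %tmp, %ivs minus last)
///     }
///   }
///   %res = load(vector_type_cast(%tmp)) : vector<(vector_shape) x type>
/// ```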
/// Performs the rewrite.
template <>
LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace mlir::edsc::op;

  TransferReadOp transfer = cast<TransferReadOp>(op);

  // Fall back to a loop if the fastest varying stride is not 1 or it is
  // permuted.
  int64_t offset;
  SmallVector<int64_t, 4> strides;
  auto successStrides =
      getStridesAndOffset(transfer.getMemRefType(), strides, offset);
  if (succeeded(successStrides) && strides.back() == 1 &&
      transfer.permutation_map().isMinorIdentity()) {
    // If > 1D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferReadOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // Conservative lowering to scalar load / stores.
  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-copy-load-dealloc.
  MLIRContext *ctx = op->getContext();
  Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer);
  StdIndexedValue local(tmp);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivsStorage = llvm::to_vector<8>(loopIvs);
    // Swap the ivs which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);

    ArrayRef<Value> ivs(ivsStorage);
    Value pos = std_index_cast(IntegerType::get(32, ctx), ivs.back());
    Value inVector = local(ivs.drop_back());
    auto loadValue = [&](ArrayRef<Value> indices) {
      Value vector = vector_insert_element(remote(indices), inVector, pos);
      local(ivs.drop_back()) = vector;
    };
    auto loadPadding = [&](ArrayRef<Value>) {
      Value vector = vector_insert_element(transfer.padding(), inVector, pos);
      local(ivs.drop_back()) = vector;
    };
    emitWithBoundsChecks(
        rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()),
        ivs, memRefBoundsCapture, loadValue, loadPadding);
  });

  Value vectorValue = std_load(vector_type_cast(tmp));

  // 3. Propagate.
  rewriter.replaceOp(op, vectorValue);
  return success();
}

/// Lowers TransferWriteOp into a combination of:
///   1. local memory allocation;
///   2. vector_store to local buffer (viewed as a memref<1 x vector>);
///   3. perfect loop nest over:
///      a. scalar load from the local buffer (viewed as a scalar memref);
///      b. scalar store to the original memref (if in bounds).
///   4. local memory deallocation.
///
/// More specifically, lowers the data transfer part while ensuring no
/// out-of-bounds accesses are possible.
template <>
LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace edsc::op;

  TransferWriteOp transfer = cast<TransferWriteOp>(op);

  // Fall back to a loop if the fastest varying stride is not 1 or it is
  // permuted.
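  // (Note: a minor-identity permutation combined with a unit innermost stride
  // means each 1-D slice along the fastest-varying dimension is contiguous in
  // memory, so the op can be decomposed into 1-D transfers instead of scalar
  // loops.)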
  int64_t offset;
  SmallVector<int64_t, 4> strides;
  auto successStrides =
      getStridesAndOffset(transfer.getMemRefType(), strides, offset);
  if (succeeded(successStrides) && strides.back() == 1 &&
      transfer.permutation_map().isMinorIdentity()) {
    // If > 1D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  Value vectorValue(transfer.vector());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-store-copy-dealloc.
  Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer);
  StdIndexedValue local(tmp);
  Value vec = vector_type_cast(tmp);
  std_store(vectorValue, vec);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivsStorage = llvm::to_vector<8>(loopIvs);
    // Swap the ivsStorage which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);

    ArrayRef<Value> ivs(ivsStorage);
    Value pos =
        std_index_cast(IntegerType::get(32, op->getContext()), ivs.back());
    auto storeValue = [&](ArrayRef<Value> indices) {
      Value scalar = vector_extract_element(local(ivs.drop_back()), pos);
      remote(indices) = scalar;
    };
    emitWithBoundsChecks(
        rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()),
        ivs, memRefBoundsCapture, storeValue);
  });

  // 3. Erase.
  rewriter.eraseOp(op);
  return success();
}

void populateVectorToSCFConversionPatterns(
    OwningRewritePatternList &patterns, MLIRContext *context,
    const VectorTransferToSCFOptions &options) {
  patterns.insert<VectorTransferRewriter<TransferReadOp>,
                  VectorTransferRewriter<TransferWriteOp>>(options, context);
}

} // namespace mlir

namespace {

struct ConvertVectorToSCFPass
    : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
  ConvertVectorToSCFPass() = default;
  ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
    this->fullUnroll = options.unroll;
  }

  void runOnFunction() override {
    OwningRewritePatternList patterns;
    auto *context = getFunction().getContext();
    populateVectorToSCFConversionPatterns(
        patterns, context, VectorTransferToSCFOptions().setUnroll(fullUnroll));
    applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
  }
};

} // namespace

std::unique_ptr<Pass>
mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
  return std::make_unique<ConvertVectorToSCFPass>(options);
}
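
// Example usage (illustrative sketch only; `ctx` and `module` are assumed to
// exist in the caller): schedule this lowering on every function in a module,
// with full unrolling of the generated major loop nests.
//
//   PassManager pm(ctx);
//   pm.addNestedPass<FuncOp>(createConvertVectorToSCFPass(
//       VectorTransferToSCFOptions().setUnroll(true)));
//   if (failed(pm.run(module)))
//     /* handle the error */;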