//===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_

#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "llvm/Support/FormatVariadic.h"

namespace mlir {

/// Lowers gpu.func to llvm.func. Workgroup memory attributions are rewritten
/// to module-level globals in the workgroup address space; private memory
/// attributions are rewritten to `alloca`s in `AllocaAddrSpace`.
template <unsigned AllocaAddrSpace>
struct GPUFuncOpLowering : ConvertToLLVMPattern {
  explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
      : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
                             typeConverter.getDialect()->getContext(),
                             typeConverter) {}

  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    assert(operands.empty() && "func op is not expected to have operands");
    auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
    Location loc = gpuFuncOp.getLoc();

    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
      Value attribution = en.value();

      auto type = attribution.getType().dyn_cast<MemRefType>();
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      uint64_t numElements = type.getNumElements();

      auto elementType = typeConverter->convertType(type.getElementType())
                             .template cast<LLVM::LLVMType>();
      auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
      std::string name = std::string(
          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
      auto globalOp = rewriter.create<LLVM::GlobalOp>(
          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
          LLVM::Linkage::Internal, name, /*value=*/Attribute(),
          gpu::GPUDialect::getWorkgroupAddressSpace());
      workgroupBuffers.push_back(globalOp);
    }
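    // Illustrative example (hypothetical kernel name and shape): for a
    // `gpu.func @kernel` with a workgroup attribution
    // `workgroup(%arg1 : memref<32xf32, 3>)`, the loop above creates an
    // internal global named `__wg_kernel_0` holding a 32-element array of the
    // converted element type in the workgroup address space.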
    // Rewrite the original GPU function to an LLVM function.
    auto funcType = typeConverter->convertType(gpuFuncOp.getType())
                        .template cast<LLVM::LLVMType>()
                        .getPointerElementTy();

    // Remap proper input types.
    TypeConverter::SignatureConversion signatureConversion(
        gpuFuncOp.front().getNumArguments());
    getTypeConverter()->convertFunctionSignature(
        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

    // Create the new function operation. Only copy those attributes that are
    // not specific to function modeling.
    SmallVector<NamedAttribute, 4> attributes;
    for (const auto &attr : gpuFuncOp.getAttrs()) {
      if (attr.first == SymbolTable::getSymbolAttrName() ||
          attr.first == impl::getTypeAttrName() ||
          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
        continue;
      attributes.push_back(attr);
    }
    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
        gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
        LLVM::Linkage::External, attributes);

    {
      // Insert operations that correspond to converted workgroup and private
      // memory attributions to the body of the function. This must operate on
      // the original function, before the body region is inlined in the new
      // function to maintain the relation between block arguments and the
      // parent operation that assigns their semantics.
      OpBuilder::InsertionGuard guard(rewriter);

      // Rewrite workgroup memory attributions to addresses of global buffers.
      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
      unsigned numProperArguments = gpuFuncOp.getNumArguments();
      auto i32Type = LLVM::LLVMType::getInt32Ty(rewriter.getContext());

      Value zero = nullptr;
      if (!workgroupBuffers.empty())
        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                                 rewriter.getI32IntegerAttr(0));
      for (auto en : llvm::enumerate(workgroupBuffers)) {
        LLVM::GlobalOp global = en.value();
        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
        auto elementType = global.getType().getArrayElementType();
        Value memory = rewriter.create<LLVM::GEPOp>(
            loc, elementType.getPointerTo(global.addr_space()), address,
            ArrayRef<Value>{zero, zero});

        // Build a memref descriptor pointing to the buffer to plug with the
        // existing memref infrastructure. This may use more registers than
        // otherwise necessary given that memref sizes are fixed, but we can
        // try and canonicalize that away later.
        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
        auto type = attribution.getType().cast<MemRefType>();
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, *getTypeConverter(), type, memory);
        signatureConversion.remapInput(numProperArguments + en.index(), descr);
      }
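      // Illustrative example (hypothetical types): a private attribution such
      // as `private(%arg2 : memref<4xf32, 5>)` is rewritten by the loop below
      // into an `llvm.alloca` of 4 converted elements whose pointer lives in
      // `AllocaAddrSpace`, dropping the original memory space.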
      // Rewrite private memory attributions to alloca'ed buffers.
      unsigned numWorkgroupAttributions =
          gpuFuncOp.getNumWorkgroupAttributions();
      auto int64Ty = LLVM::LLVMType::getInt64Ty(rewriter.getContext());
      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
        Value attribution = en.value();
        auto type = attribution.getType().cast<MemRefType>();
        assert(type && type.hasStaticShape() &&
               "unexpected type in attribution");

        // Explicitly drop memory space when lowering private memory
        // attributions since NVVM models it as `alloca`s in the default
        // memory space and does not support `alloca`s with addrspace(5).
        auto ptrType = typeConverter->convertType(type.getElementType())
                           .template cast<LLVM::LLVMType>()
                           .getPointerTo(AllocaAddrSpace);
        Value numElements = rewriter.create<LLVM::ConstantOp>(
            gpuFuncOp.getLoc(), int64Ty,
            rewriter.getI64IntegerAttr(type.getNumElements()));
        Value allocated = rewriter.create<LLVM::AllocaOp>(
            gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, *getTypeConverter(), type, allocated);
        signatureConversion.remapInput(
            numProperArguments + numWorkgroupAttributions + en.index(), descr);
      }
    }

    // Move the region to the new function, update the entry block signature.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                                llvmFuncOp.end());
    if (failed(rewriter.convertRegionTypes(
            &llvmFuncOp.getBody(), *typeConverter, &signatureConversion)))
      return failure();

    rewriter.eraseOp(gpuFuncOp);
    return success();
  }
};

/// Lowers gpu.return to llvm.return, forwarding the converted operands.
struct GPUReturnOpLowering : public ConvertToLLVMPattern {
  GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
      : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
                             typeConverter.getDialect()->getContext(),
                             typeConverter) {}

  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
    return success();
  }
};

} // namespace mlir

#endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_