//===- LowerGpuOpsToNVVMOps.cpp - MLIR GPU to NVVM lowering passes --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate NVVMIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"

#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"

using namespace mlir;

namespace {

struct GPUShuffleOpLowering : public ConvertToLLVMPattern {
  explicit GPUShuffleOpLowering(LLVMTypeConverter &lowering_)
      : ConvertToLLVMPattern(gpu::ShuffleOp::getOperationName(),
                             lowering_.getDialect()->getContext(), lowering_) {}

  /// Lowers a shuffle to the corresponding NVVM op.
  ///
  /// Convert the `width` argument into an activeMask (a bitmask which specifies
  /// which threads participate in the shuffle) and a maskAndClamp (specifying
  /// the highest lane which participates in the shuffle).
  ///
  ///     %one = llvm.constant(1 : i32) : !llvm.i32
  ///     %shl = llvm.shl %one, %width : !llvm.i32
  ///     %active_mask = llvm.sub %shl, %one : !llvm.i32
  ///     %mask_and_clamp = llvm.sub %width, %one : !llvm.i32
  ///     %shfl = nvvm.shfl.sync.bfly %active_mask, %value, %offset,
  ///         %mask_and_clamp : !llvm<"{ float, i1 }">
  ///     %shfl_value = llvm.extractvalue %shfl[0 : index] :
  ///         !llvm<"{ float, i1 }">
  ///     %shfl_pred = llvm.extractvalue %shfl[1 : index] :
  ///         !llvm<"{ float, i1 }">
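  ///
  /// The `gpu.shuffle` op being rewritten has two results: the shuffled value
  /// and an i1 flag indicating whether the source lane was active, which is
  /// why two `extractvalue`s are emitted above.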
  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    gpu::ShuffleOpAdaptor adaptor(operands);

    auto valueTy = adaptor.value().getType().cast<LLVM::LLVMType>();
    auto int32Type = LLVM::LLVMType::getInt32Ty(rewriter.getContext());
    auto predTy = LLVM::LLVMType::getInt1Ty(rewriter.getContext());
    auto resultTy =
        LLVM::LLVMType::getStructTy(rewriter.getContext(), {valueTy, predTy});

    Value one = rewriter.create<LLVM::ConstantOp>(
        loc, int32Type, rewriter.getI32IntegerAttr(1));
    // Bit mask of active lanes: `(1 << width) - 1`.
    Value activeMask = rewriter.create<LLVM::SubOp>(
        loc, int32Type,
        rewriter.create<LLVM::ShlOp>(loc, int32Type, one, adaptor.width()),
        one);
    // Clamp lane: `width - 1`.
    Value maskAndClamp =
        rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.width(), one);

    auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
    Value shfl = rewriter.create<NVVM::ShflBflyOp>(
        loc, resultTy, activeMask, adaptor.value(), adaptor.offset(),
        maskAndClamp, returnValueAndIsValidAttr);
    Value shflValue = rewriter.create<LLVM::ExtractValueOp>(
        loc, valueTy, shfl, rewriter.getIndexArrayAttr(0));
    Value isActiveSrcLane = rewriter.create<LLVM::ExtractValueOp>(
        loc, predTy, shfl, rewriter.getIndexArrayAttr(1));

    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

/// Import the auto-generated GPU to NVVM lowering patterns.
#include "GPUToNVVM.cpp.inc"

/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding NVVM equivalent.
///
/// This pass only handles device code and is not meant to be run on GPU host
/// code.
struct LowerGpuOpsToNVVMOpsPass
    : public ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
  LowerGpuOpsToNVVMOpsPass() = default;
  LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
    this->indexBitwidth = indexBitwidth;
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();

    /// Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options = {/*useBarePtrCallConv =*/false,
                                  /*emitCWrappers =*/true,
                                  /*indexBitwidth =*/indexBitwidth,
                                  /*useAlignedAlloc =*/false};

    /// MemRef conversion for GPU to NVVM lowering. The GPU dialect uses memory
    /// space 5 for private memory attributions, but NVVM represents private
    /// memory allocations as local `alloca`s in the default address space. This
    /// converter drops the private memory space to support the use case above.
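    /// For example (illustrative), a private attribution of type
    /// `memref<4xf32, 5>` is converted as if it had been `memref<4xf32>` in
    /// the default address space; returning llvm::None for other memref types
    /// lets the default conversion handle them.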
    LLVMTypeConverter converter(m.getContext(), options);
    converter.addConversion([&](MemRefType type) -> Optional<Type> {
      if (type.getMemorySpace() != gpu::GPUDialect::getPrivateAddressSpace())
        return llvm::None;
      return converter.convertType(MemRefType::Builder(type).setMemorySpace(0));
    });

    OwningRewritePatternList patterns, llvmPatterns;

    // Apply in-dialect lowering first. In-dialect lowering replaces ops with
    // other ops that themselves need further lowering, something a single
    // conversion pass does not support.
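    // At the time of writing these rewrites include, for example, expanding
    // `gpu.all_reduce` into shuffles and barriers that the conversion patterns
    // below can then lower.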
    populateGpuRewritePatterns(m.getContext(), patterns);
    applyPatternsAndFoldGreedily(m, std::move(patterns));

    populateStdToLLVMConversionPatterns(converter, llvmPatterns);
    populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
    LLVMConversionTarget target(getContext());
    configureGpuToNVVMConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
  }
};

} // anonymous namespace

void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
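  // Marking these LLVM ops illegal forces the corresponding standard math ops
  // to be lowered to libdevice (`__nv_*`) calls by the patterns registered in
  // populateGpuToNVVMConversionPatterns, rather than to LLVM intrinsics
  // (assumed not to be supported on the NVVM path).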
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::FAbsOp, LLVM::FCeilOp,
                      LLVM::FFloorOp, LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op,
                      LLVM::SinOp, LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}

void mlir::populateGpuToNVVMConversionPatterns(
    LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
  populateWithGenerated(converter.getDialect()->getContext(), patterns);
  patterns
      .insert<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
                                          NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
              GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
                                          NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
              GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
                                          NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
              GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                          NVVM::GridDimYOp, NVVM::GridDimZOp>,
              GPUShuffleOpLowering, GPUReturnOpLowering,
              // Explicitly drop memory space when lowering private memory
              // attributions since NVVM models it as `alloca`s in the default
              // memory space and does not support `alloca`s with addrspace(5).
              GPUFuncOpLowering<0>>(converter);
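  // The following patterns rewrite standard math ops into calls to CUDA
  // libdevice functions: the first name is used for f32 operands and the
  // second for f64. For example (illustrative), `exp %x : f32` becomes a call
  // to `@__nv_expf`.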
  patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
                                                "__nv_fabs");
  patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__nv_ceilf",
                                                 "__nv_ceil");
  patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__nv_cosf",
                                               "__nv_cos");
  patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__nv_expf",
                                               "__nv_exp");
  patterns.insert<OpToFuncCallLowering<FloorFOp>>(converter, "__nv_floorf",
                                                  "__nv_floor");
  patterns.insert<OpToFuncCallLowering<LogOp>>(converter, "__nv_logf",
                                               "__nv_log");
  patterns.insert<OpToFuncCallLowering<Log10Op>>(converter, "__nv_log10f",
                                                 "__nv_log10");
  patterns.insert<OpToFuncCallLowering<Log2Op>>(converter, "__nv_log2f",
                                                "__nv_log2");
  patterns.insert<OpToFuncCallLowering<RsqrtOp>>(converter, "__nv_rsqrtf",
                                                 "__nv_rsqrt");
  patterns.insert<OpToFuncCallLowering<SinOp>>(converter, "__nv_sinf",
                                               "__nv_sin");
  patterns.insert<OpToFuncCallLowering<SqrtOp>>(converter, "__nv_sqrtf",
                                                "__nv_sqrt");
  patterns.insert<OpToFuncCallLowering<TanhOp>>(converter, "__nv_tanhf",
                                                "__nv_tanh");
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
  return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth);
}
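
// Typical usage (illustrative; `pm` and `kernelPm` are hypothetical names):
// the pass is nested on gpu.module ops from a host-level pass pipeline, e.g.
//   OpPassManager &kernelPm = pm.nest<gpu::GPUModuleOp>();
//   kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass(/*indexBitwidth=*/32));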