/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMISSION_UTILS_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMISSION_UTILS_H_

#include <array>
#include <string>
#include <utility>
#include <vector>

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Value.h"
#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
#include "mlir/IR/Operation.h"  // from @llvm-project
#include "mlir/IR/Value.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h"
#include "tensorflow/compiler/mlir/xla/type_to_shape.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

// TODO(jlebar): Move functions related to cublas/cudnn to a separate file; they
// don't belong in "ir_emission_utils".

namespace xla {
namespace gpu {

// Different types of convolutions supported by cudnn.
//
// A way to think about these is that a convolution is defined by three arrays
// -- the "input", the "filter", and the "output" -- and given any two of these,
// we can compute the third. For example, a backward-input convolution takes as
// input a filter and an "output" and produces an "input" such that if one were
// to do a forward convolution of "input" using filter, the result would be
// something with the same shape as "output".
//
// This way of thinking is not correct if you look at the values produced. For
// example, a backward-input convolution is not actually the mathematical
// inverse of a forward convolution. But it's right as far as the shapes and
// "connectivity" (i.e. which elements of the input affect which elements of
// the output) are concerned.
enum class CudnnConvKind {
  kForward,            // input  + filter => output
  kBackwardInput,      // filter + output => input
  kBackwardFilter,     // input  + output => filter
  kForwardActivation,  // activation(conv(input, filter) + broadcast(bias) +
                       // (optionally) side_input) => output
};
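
// Shape-level example (illustrative only; dimensions are hypothetical, NCHW
// input / OIHW filter layouts assumed): a kForward conv of an input
// f32[8,32,64,64] with a filter f32[16,32,3,3] (stride 1, no padding) yields
// an output f32[8,16,62,62]. The matching kBackwardInput conv takes that
// filter plus an f32[8,16,62,62] "output" and produces an array shaped like
// the input, f32[8,32,64,64] -- same shapes and connectivity as the forward
// op, but not its mathematical inverse.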

StatusOr<CudnnConvKind> GetCudnnConvKind(const HloCustomCallInstruction* instr);

// Converts a CudnnConvKind value to a string.
string CudnnConvKindToString(CudnnConvKind kind);

// Returns true if `dot` is a matrix multiplication that has not yet been
// rewritten into a GEMM custom call.
//
// This function should never return true on instructions after the
// GemmRewriter pass has finished.
bool IsMatrixMultiplication(const HloInstruction& dot);

// Returns true if `hlo` is a matrix multiplication that has been rewritten
// into a GEMM custom call. After the GemmRewriter lowering pass, all matrix
// multiplications should take this form.
bool IsCublasGemm(const HloInstruction& hlo);
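
// Illustrative sketch (simplified, not exact HLO syntax): before GemmRewriter
// runs, a matrix multiplication appears as a plain dot,
//
//   %d = f32[128,64] dot(f32[128,256] %a, f32[256,64] %b)
//
// for which IsMatrixMultiplication returns true. After the rewrite it appears
// as a CustomCall whose target is kGemmCallTarget, for which IsCublasGemm
// returns true and IsMatrixMultiplication returns false.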

constexpr int64_t kWarpSize = 32;

// Need at least 256 threads/block for reasonable tree reduction
// performance (assuming all data fits).
constexpr int64_t kMinThreadsXRowReduction = 256;

// When doing a batched row reduction, the maximum size the batch dimension
// may have.
static constexpr int64_t kBatchedReductionRaceFreeBound = 8;

// A call to cuBLAS general matrix multiplication API.
extern const char* const kGemmCallTarget;

// A call to cuDNN for batch normalization is represented as CustomCall HLO with
// a call target equal to one of these strings.
//
// The operands to and outputs of these calls are the same as those of the
// corresponding HLOs, except:
//
//  - epsilon and feature_index are proper operands, at the end of the operands
//    list. They must be HLO constants.
//  - The cuDNN forward training call returns inv_stddev =
//    1/sqrt(variance + epsilon) in place of plain variance.
//  - Similarly, BatchNormGrad accepts inv_stddev in place of the variance
//    operand.
extern const char* const kCudnnBatchNormForwardInferenceCallTarget;
extern const char* const kCudnnBatchNormForwardTrainingCallTarget;
extern const char* const kCudnnBatchNormBackwardCallTarget;
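
// Rough operand-order sketch (illustrative, not exact HLO syntax): a forward
// training call might look like
//
//   custom-call(%operand, %scale, %offset, %epsilon, %feature_index),
//       custom_call_target=kCudnnBatchNormForwardTrainingCallTarget
//
// where %epsilon and %feature_index are constant HLOs appended to the operand
// list, and the result tuple carries inv_stddev in place of plain variance.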

// Returns true if `hlo` will be implemented as a call to a cuDNN batch
// normalization routine.
//
// This returns true if `hlo` is a CustomCall HLO with a call target equal to
// one of the kCudnnBatchNormFoo constants above, but returns *false* for HLOs
// with one of the kBatchNorm opcodes, because these are lowered either to a
// sequence of generic HLOs or to a cuDNN CustomCall.
bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo);

// A call to cuDNN for convolution (forward, backward filter, or backward input)
// is represented as a CustomCall HLO with a call target equal to one of these
// strings.
//
// These CustomCalls have window() and convolution_dimension_numbers() set like
// regular convolution ops. They have the same LHS and RHS operands, plus two
// additional constant operands: an int64 operand for the cudnn algorithm and
// a bool operand for whether tensor_ops is enabled. A value of -1 for the cudnn
// algorithm means that the implementation is free to choose the best algorithm
// it can.
//
// These calls output a tuple (conv_result, scratch_memory), where conv_result
// is the actual result of the convolution, and scratch_memory is temporary
// memory used by cudnn. Callers shouldn't inspect scratch_memory, as its value
// is not well-defined.
//
// GpuConvRewriter lowers kConvolution HLOs to these custom calls.
// When it does so, it chooses algorithm -1 and 0 bytes of scratch space. Later
// on in the pipeline, CudnnConvAlgorithmChooser chooses an explicit
// algorithm for each conv and sets the amount of scratch space needed.
//
// (Representing the scratch memory as an output may seem strange at first, but
// it's quite sensible, from a certain point of view. The scratch buffer is a
// location in memory that the conv can write into, but which it can't legally
// read from, at least until it's written something first. But that's exactly
// the definition of an output buffer.)
extern const char* const kCudnnConvForwardCallTarget;
extern const char* const kCudnnConvBackwardInputCallTarget;
extern const char* const kCudnnConvBackwardFilterCallTarget;
extern const char* const kCudnnConvBiasActivationForwardCallTarget;
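
// Rough sketch of the lowered form (illustrative, not exact HLO syntax): right
// after GpuConvRewriter, a forward convolution looks something like
//
//   (f32[8,16,62,62], u8[0]) custom-call(%input, %filter),
//       custom_call_target=kCudnnConvForwardCallTarget, window=..., dim_labels=...
//
// with algorithm -1 and a zero-sized u8[0] scratch output; later,
// CudnnConvAlgorithmChooser picks a concrete algorithm and scratch size.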

// Returns true if `hlo` will be implemented as a call to a cuDNN convolution
// routine.
//
// This returns true if `hlo` is a CustomCall HLO with a call target equal to
// one of the kCudnnConvFoo constants above, but returns *false* for HLOs with a
// kConvolution opcode.
bool IsCustomCallToDnnConvolution(const HloInstruction& hlo);

// Returns true if `hlo` will be implemented as a call to a cuSolver routine.
//
// This returns true if `hlo` is a CustomCall HLO with a call target equal to
// one of the kCusolver... constants, but returns *false* for HLOs with,
// say, a kCholesky opcode.
bool IsCustomCallToCusolver(const HloInstruction& hlo);

// Cholesky decomposition. Takes a (batched) matrix as input, and returns a
// tuple of (result, workspace, info), where result is the result of the
// Cholesky decomposition, workspace is scratch space for cuSolver, and info
// is a success/failure code per batch element.
extern const char* const kCusolverCholeskyCallTarget;

// Returns true if `hlo` will be implemented as a library call, e.g. cuBLAS gemm
// or cuDNN convolution.
bool ImplementedAsLibraryCall(const HloInstruction& hlo);

// Layout analysis for fusion. The constructor will analyze the given LMHLO
// fusion operation and store the inferred layouts of values internal to the
// fusion. The default constructor is used when dealing with plain LMHLO
// operations, in which case no analysis is needed and the layout can be
// inferred from the memref types (so that we can have a unified interface in
// helper functions to query layouts).
class FusionLayoutAnalysis {
 public:
  FusionLayoutAnalysis() {}
  explicit FusionLayoutAnalysis(mlir::lmhlo::FusionOp fusion_op);

  // Gets the shape of a given value, including its inferred layout.
  Shape GetShape(mlir::Value value) const;

 private:
  llvm::DenseMap<mlir::Value, Layout> layouts_;
};
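
// Usage sketch (a minimal example; `fusion_op`, `internal_value`, and
// `memref_value` are hypothetical):
//
//   FusionLayoutAnalysis analysis(fusion_op);       // analyze an LMHLO fusion
//   Shape s0 = analysis.GetShape(internal_value);   // shape with inferred layout
//
//   FusionLayoutAnalysis no_analysis;               // plain LMHLO op: layout is
//   Shape s1 = no_analysis.GetShape(memref_value);  // read off the memref type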

// Returns true if either the dimensions being reduced or the dimensions being
// kept are contiguous in the input of the reduce instruction.
bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce);

// MLIR variant that relies on the shape layouts from fusion layout analysis.
bool IsReductionFromOrToContiguousDimensions(
    mlir::Operation* reduce, const FusionLayoutAnalysis& layout_analysis);

// Returns whether unnested_hlo is an input fusion whose root is either a slice
// or a tuple of slices. If verify_no_strides is true, returns false unless all
// ROOT slices have no strides.
bool IsInputFusibleSlices(mlir::Operation* unnested_hlo,
                          bool verify_no_strides);

struct ReductionDimensions {
  // Indicates whether the reduction is a row reduction or a column reduction.
  bool is_row_reduction;

  // Contains the size of the three contiguous components for
  // the reduction [depth, height, width] (major-to-minor ordering).
  //
  // For row reduction, we do: [D, H, W] -> [D, H].
  // For column reduction, we do: [D, H, W] -> [D, W].
  std::array<int64, 3> dimensions;
};
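
// Example: for an f32[32,512,1024] input with the default layout, reducing
// dimension {2} keeps the two contiguous major dimensions, giving a row
// reduction with dimensions [D, H, W] = [1, 32*512, 1024] (i.e.
// [D, H, W] -> [D, H]); reducing dimension {1} instead gives a column
// reduction with [D, H, W] = [32, 512, 1024] (i.e. [D, H, W] -> [D, W]).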

// Given the input shape and dimensions to reduce for a reduction, returns
// ReductionDimensions.
//
// Prerequisite: the reduction instruction passes the check
// IsReductionFromOrToContiguousDimensions, which guarantees either the
// dimensions to reduce or the dimensions to keep are consecutive.
ReductionDimensions GetReductionKindAndContiguousComponents(
    const HloInstruction& reduce);
ReductionDimensions GetReductionKindAndContiguousComponents(
    mlir::Operation* reduce);

// Get tiling per thread for the given reduction in dimensions [D, H, W].
std::array<int64, 3> GetReductionTiling(
    const ReductionDimensions& reduction_dimensions,
    int smallest_input_dtype_bits,
    se::CudaComputeCapability cuda_compute_capability);

// Emits call to "vprintf" with given format and arguments.
llvm::Value* EmitPrintf(absl::string_view fmt,
                        absl::Span<llvm::Value* const> arguments,
                        llvm::IRBuilder<>* builder);
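
// Usage sketch (names hypothetical; `b` is an llvm::IRBuilder<>* and the
// arguments are llvm::Value*s already computed in the kernel):
//
//   EmitPrintf("block=%d value=%f\n", {block_id_val, data_val}, b);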

// Emits code to shuffle data between threads of a warp. This has the same
// semantics as the PTX "shfl.sync.down" instruction but works for values that
// aren't 32 bits in size. The last operand of the emitted "shfl" is
// `kWarpSize - 1`.
//
// This function emits a "full-warp" shuffle, which all threads of a warp
// participate in. *Do not use this function from a divergent context:* You
// can't correctly do so on both Volta and earlier GPUs.
//
// https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync
llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset,
                                     llvm::IRBuilder<>* builder);
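
// Typical usage is a shuffle-down tree reduction across one warp (a sketch;
// `partial` is a hypothetical llvm::Value* holding this thread's floating-point
// partial sum and `b` is the IRBuilder):
//
//   for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
//     llvm::Value* peer =
//         EmitFullWarpShuffleDown(partial, b->getInt32(offset), b);
//     partial = b->CreateFAdd(partial, peer);
//   }
//
// After the loop, lane 0 of the warp holds the fully reduced value.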

// Emits code that determines whether the current thread is thread 0 within
// block 0 of the kernel.
llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b);

// Returns whether the output of a fusion with reduction is consistent with
// `first_reduce`.
bool IsFusedReductionOutputConsistent(const HloInstruction* inst,
                                      const HloInstruction* first_reduce);
bool IsFusedReductionOutputConsistent(
    mlir::mhlo::ReduceOp inst, mlir::mhlo::ReduceOp first_reduce,
    const FusionLayoutAnalysis& layout_analysis);

inline bool AreFusedReductionOutputsConsistent(
    absl::Span<const HloInstruction* const> output_instructions,
    const HloInstruction* first_reduce) {
  return absl::c_all_of(output_instructions, [=](const HloInstruction* inst) {
    return IsFusedReductionOutputConsistent(inst, first_reduce);
  });
}

inline std::string MlirToString(mlir::Operation* op) {
  std::string s;
  {
    llvm::raw_string_ostream os(s);
    op->print(os);
  }
  return s;
}

inline std::string MlirToString(const mlir::Location& loc) {
  std::string s;
  {
    llvm::raw_string_ostream os(s);
    loc.print(os);
  }
  return s;
}

int PartitionLmhloOperandsAndOutputs(mlir::Operation* op);
std::vector<mlir::Value> GetHloOperands(mlir::Operation* op);
std::vector<mlir::Value> GetHloOutputs(mlir::Operation* op);

bool WritesMlirBuffer(mlir::Operation* op, mlir::Value operand);

template <typename T>
std::vector<T> ToStdVector(const llvm::SmallVectorImpl<T>& v) {
  return std::vector<T>(v.begin(), v.end());
}

StatusOr<BufferAllocation::Slice> GetAllocationSlice(
    mlir::Value v, absl::Span<const BufferAllocation> allocations,
    std::string* constant_name = nullptr);

bool CanEmitFusedDynamicUpdateSliceInPlaceForGpu(
    mlir::lmhlo::FusionOp fusion,
    absl::Span<const BufferAllocation> allocations);

Shape GetShape(mlir::Value value);

// Returns whether the given reduction can be safely generated without atomics:
// that is, at most one block will write to every output element.
bool ReductionIsRaceFree(const ReductionDimensions& reduction_dimensions,
                         const std::array<int64_t, 3>& reduction_tiling);

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMISSION_UTILS_H_