/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMISSION_UTILS_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMISSION_UTILS_H_

#include <string>
#include <utility>

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Value.h"
#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
#include "mlir/IR/Operation.h"  // from @llvm-project
#include "mlir/IR/Value.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h"
#include "tensorflow/compiler/mlir/xla/type_to_shape.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

// TODO(jlebar): Move functions related to cublas/cudnn to a separate file; they
// don't belong in "ir_emission_utils".

namespace xla {
namespace gpu {

// Different types of convolutions supported by cudnn.
//
// A way to think about these is that a convolution is defined by three arrays
// -- the "input", the "filter", and the "output" -- and given any two of these,
// we can compute the third.  For example, a backward-input convolution takes as
// input a filter and an "output" and produces an "input" such that if one were
// to do a forward convolution of "input" using filter, the result would be
// something with the same shape as "output".
//
// This way of thinking is not correct if you look at the values produced. For
// example, a backward-input convolution is not actually the mathematical
// inverse of a forward convolution.  But it's right as far as the shapes and
// "connectivity" (i.e. which elements of the input affect which elements of
// the output) are concerned.
enum class CudnnConvKind {
  kForward,            // input  + filter => output
  kBackwardInput,      // filter + output => input
  kBackwardFilter,     // input  + output => filter
  kForwardActivation,  // activation(conv(input, filter) + broadcast(bias) +
                       // (optionally) side_input) => output
};

StatusOr<CudnnConvKind> GetCudnnConvKind(const HloCustomCallInstruction* instr);

// Converts a CudnnConvKind value to a string.
string CudnnConvKindToString(CudnnConvKind kind);
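
// Example (an illustrative sketch, not part of this header's API): classifying
// a cuDNN convolution custom call and logging its kind, assuming `instr` is a
// pointer to an HloCustomCallInstruction produced by the conv rewrite.
//
//   TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
//   VLOG(2) << "conv kind: " << CudnnConvKindToString(kind);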

// Returns true if `dot` is a matrix multiplication that has not yet been
// rewritten into a GEMM custom call.
//
// This function should never return true on instructions after the
// GemmRewriter pass has finished.
bool IsMatrixMultiplication(const HloInstruction& dot);

// Returns true if `hlo` is a matrix multiplication that has been rewritten
// into a GEMM custom call.  All matrix multiplications should be rewritten as
// such custom calls after the GemmRewriter lowering pass.
bool IsCublasGemm(const HloInstruction& hlo);
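
// Example (illustrative sketch): before the GemmRewriter pass runs, a dot that
// will be lowered to cuBLAS satisfies IsMatrixMultiplication; after the pass,
// the resulting custom call satisfies IsCublasGemm instead (assuming `instr`
// is an HloInstruction reference).
//
//   bool pre_rewrite_gemm = IsMatrixMultiplication(instr);
//   bool post_rewrite_gemm = IsCublasGemm(instr);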

constexpr int64_t kWarpSize = 32;

// Need at least 256 threads/block for reasonable tree reduction
// performance (assuming all data fits).
constexpr int64_t kMinThreadsXRowReduction = 256;

// When doing batched row reduction, the maximum size the batch dimension may
// have.
static constexpr int64_t kBatchedReductionRaceFreeBound = 8;

// A call to the cuBLAS general matrix multiplication API.
extern const char* const kGemmCallTarget;

// A call to cuDNN for batch normalization is represented as a CustomCall HLO
// with a call target equal to one of these strings.
//
// The operands to and outputs of these calls are the same as those of the
// corresponding HLOs, except:
//
//  - epsilon and feature_index are proper operands, at the end of the operands
//    list.  They must be HLO constants.
//  - The cuDNN forward training call returns inv_stddev =
//    1/sqrt(variance + epsilon) in place of plain variance.
//  - Similarly, BatchNormGrad accepts inv_stddev in place of the variance
//    operand.
extern const char* const kCudnnBatchNormForwardInferenceCallTarget;
extern const char* const kCudnnBatchNormForwardTrainingCallTarget;
extern const char* const kCudnnBatchNormBackwardCallTarget;

// Returns true if `hlo` will be implemented as a call to a cuDNN batch
// normalization routine.
//
// This returns true if `hlo` is a CustomCall HLO with a call target equal to
// one of the kCudnnBatchNormFoo constants above, but returns *false* for HLOs
// with one of the kBatchNorm opcodes, because these are lowered either to a
// sequence of generic HLOs or to a cuDNN CustomCall.
bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo);
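
// Example (illustrative sketch): distinguishing an already-lowered cuDNN
// batch-norm custom call from a not-yet-lowered kBatchNorm* HLO, assuming
// `hlo` is an HloInstruction reference.
//
//   if (IsCustomCallToDnnBatchNorm(hlo)) {
//     // Already a cuDNN custom call; epsilon and feature_index are the
//     // trailing constant operands described above.
//   } else if (hlo.opcode() == HloOpcode::kBatchNormTraining) {
//     // Plain HLO batch norm; not (yet) lowered to a cuDNN call.
//   }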

// A call to cuDNN for convolution (forward, backward filter, or backward input)
// is represented as a CustomCall HLO with a call target equal to one of these
// strings.
//
// These CustomCalls have window() and convolution_dimension_numbers() set like
// regular convolution ops.  They have the same LHS and RHS operands, plus two
// additional constant operands: an int64 operand for the cudnn algorithm and
// a bool operand for whether tensor_ops is enabled. A value of -1 for the cudnn
// algorithm means that the implementation is free to choose the best algorithm
// it can.
//
// These calls output a tuple (conv_result, scratch_memory), where conv_result
// is the actual result of the convolution, and scratch_memory is temporary
// memory used by cudnn.  Callers shouldn't inspect scratch_memory, as its value
// is not well-defined.
//
// GpuConvRewriter lowers kConvolution HLOs to these custom calls.
// When it does so, it chooses algorithm -1 and 0 bytes of scratch space.  Later
// on in the pipeline, CudnnConvAlgorithmChooser chooses an explicit
// algorithm for each conv and sets the amount of scratch space needed.
//
// (Representing the scratch memory as an output may seem strange at first, but
// it's quite sensible, from a certain point of view.  The scratch buffer is a
// location in memory that the conv can write into, but which it can't legally
// read from, at least until it's written something first.  But that's exactly
// the definition of an output buffer.)
extern const char* const kCudnnConvForwardCallTarget;
extern const char* const kCudnnConvBackwardInputCallTarget;
extern const char* const kCudnnConvBackwardFilterCallTarget;
extern const char* const kCudnnConvBiasActivationForwardCallTarget;

// Returns true if `hlo` will be implemented as a call to a cuDNN convolution
// routine.
//
// This returns true if `hlo` is a CustomCall HLO with a call target equal to
// one of the kCudnnConvFoo constants above, but returns *false* for HLOs with a
// kConvolution opcode.
bool IsCustomCallToDnnConvolution(const HloInstruction& hlo);
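
// Example (illustrative sketch): inspecting the result shape of a lowered
// convolution, assuming `conv` points to one of the CustomCall instructions
// described above.
//
//   if (IsCustomCallToDnnConvolution(*conv)) {
//     const Shape& conv_result_shape = conv->shape().tuple_shapes(0);
//     // tuple_shapes(1) is the scratch buffer, whose contents are undefined.
//   }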

// Returns true if `hlo` will be implemented as a call to a cuSolver routine.
//
// This returns true if `hlo` is a CustomCall HLO with a call target equal to
// one of the kCusolver... constants, but returns *false* for HLOs with,
// say, a kCholesky opcode.
bool IsCustomCallToCusolver(const HloInstruction& hlo);

// Cholesky decomposition. Takes a (batched) matrix as input, and returns a
// tuple of (result, workspace, info), where result is the result of the
// Cholesky decomposition, workspace is scratch space for cuSolver, and info
// is a success/failure code per batch element.
extern const char* const kCusolverCholeskyCallTarget;

// Returns true if `hlo` will be implemented as a library call, e.g. cuBLAS gemm
// or cuDNN convolution.
bool ImplementedAsLibraryCall(const HloInstruction& hlo);

// Layout analysis for fusion. The constructor analyzes the given LMHLO fusion
// operation and stores the inferred layouts of values internal to the fusion.
// The default constructor is used when dealing with LMHLO operations, in which
// case no analysis is needed and the layout can be inferred from the memref
// types (so that we can have a unified interface in helper functions to query
// layouts).
class FusionLayoutAnalysis {
 public:
  FusionLayoutAnalysis() {}
  explicit FusionLayoutAnalysis(mlir::lmhlo::FusionOp fusion_op);

  // Gets the shape of a given value, including its inferred layout.
  Shape GetShape(mlir::Value value) const;

 private:
  llvm::DenseMap<mlir::Value, Layout> layouts_;
};
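
// Example (illustrative sketch): querying the inferred layout of a value
// inside an LMHLO fusion, assuming `fusion_op` is an mlir::lmhlo::FusionOp and
// `value` is a (hypothetical) mlir::Value defined inside its region.
//
//   FusionLayoutAnalysis layout_analysis(fusion_op);
//   Shape shape_with_layout = layout_analysis.GetShape(value);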

// Returns true if either the dimensions being reduced or the dimensions being
// kept are contiguous in the input of the reduce instruction.
bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce);

// MLIR variant that relies on the shape layouts from fusion layout analysis.
bool IsReductionFromOrToContiguousDimensions(
    mlir::Operation* reduce, const FusionLayoutAnalysis& layout_analysis);

// Returns whether unnested_hlo is an input fusion whose root is either a slice
// or a tuple of slices. If verify_no_strides is true, returns false unless all
// ROOT slices have no strides.
bool IsInputFusibleSlices(mlir::Operation* unnested_hlo,
                          bool verify_no_strides);

struct ReductionDimensions {
  // Indicates whether the reduction is a row reduction or a column reduction.
  bool is_row_reduction;

  // Contains the size of the three contiguous components for
  // the reduction [depth, height, width] (major-to-minor ordering).
  //
  // For row reduction, we do: [D, H, W] -> [D, H].
  // For column reduction, we do: [D, H, W] -> [D, W].
  std::array<int64, 3> dimensions;
};

// Given the input shape and dimensions to reduce for a reduction, returns
// ReductionDimensions.
//
// Prerequisite: the reduction instruction passes the check
// IsReductionFromOrToContiguousDimensions, which guarantees either the
// dimensions to reduce or the dimensions to keep are consecutive.
ReductionDimensions GetReductionKindAndContiguousComponents(
    const HloInstruction& reduce);
ReductionDimensions GetReductionKindAndContiguousComponents(
    mlir::Operation* reduce);
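
// Example (illustrative sketch): for an f32[64, 1024] input with the default
// layout, reducing the minor dimension is a row reduction with roughly
// [D, H, W] = [1, 64, 1024] -> [1, 64], while reducing the major dimension is
// a column reduction with [D, H, W] = [1, 64, 1024] -> [1, 1024] (assuming
// `reduce` is the reduction HloInstruction).
//
//   ReductionDimensions dims = GetReductionKindAndContiguousComponents(reduce);
//   // dims.is_row_reduction and dims.dimensions describe the decomposition.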

// Returns the tiling per thread for the given reduction in dimensions
// [D, H, W].
std::array<int64, 3> GetReductionTiling(
    const ReductionDimensions& reduction_dimensions,
    int smallest_input_dtype_bits,
    se::CudaComputeCapability cuda_compute_capability);

// Emits a call to "vprintf" with the given format and arguments.
llvm::Value* EmitPrintf(absl::string_view fmt,
                        absl::Span<llvm::Value* const> arguments,
                        llvm::IRBuilder<>* builder);
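
// Example (illustrative sketch): emitting a device-side debug print, assuming
// `value` is an llvm::Value* holding a double and `b` is the active IRBuilder.
//
//   EmitPrintf("debug value: %f\n", {value}, b);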

// Emits code to shuffle data between threads of a warp. This has the same
// semantics as the PTX "shfl.sync.down" instruction but works for values that
// aren't 32 bits in size. The last operand of the emitted "shfl" is
// `kWarpSize - 1`.
//
// This function emits a "full-warp" shuffle, which all threads of a warp
// participate in.  *Do not use this function from a divergent context:* You
// can't correctly do so on both Volta and earlier GPUs.
//
// https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync
llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset,
                                     llvm::IRBuilder<>* builder);
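
// Example (illustrative sketch): the usual warp-level tree reduction built on
// top of this helper, halving the shuffle distance each step; assumes
// `partial` is this thread's f32 partial result and `builder` is the active
// IRBuilder.
//
//   for (int distance = kWarpSize / 2; distance > 0; distance /= 2) {
//     llvm::Value* peer = EmitFullWarpShuffleDown(
//         partial, builder->getInt32(distance), builder);
//     partial = builder->CreateFAdd(partial, peer);
//   }
//   // Afterwards, lane 0 of each warp holds the warp-wide result.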

// Emits code that determines whether the current thread is thread 0 within
// block 0 of the kernel.
llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b);

// Returns whether the outputs of a fusion with reduction are consistent with
// `first_reduce`.
bool IsFusedReductionOutputConsistent(const HloInstruction* inst,
                                      const HloInstruction* first_reduce);
bool IsFusedReductionOutputConsistent(
    mlir::mhlo::ReduceOp inst, mlir::mhlo::ReduceOp first_reduce,
    const FusionLayoutAnalysis& layout_analysis);

inline bool AreFusedReductionOutputsConsistent(
    absl::Span<const HloInstruction* const> output_instructions,
    const HloInstruction* first_reduce) {
  return absl::c_all_of(output_instructions, [=](const HloInstruction* inst) {
    return IsFusedReductionOutputConsistent(inst, first_reduce);
  });
}

inline std::string MlirToString(mlir::Operation* op) {
  std::string s;
  {
    llvm::raw_string_ostream os(s);
    op->print(os);
  }
  return s;
}

inline std::string MlirToString(const mlir::Location& loc) {
  std::string s;
  {
    llvm::raw_string_ostream os(s);
    loc.print(os);
  }
  return s;
}

int PartitionLmhloOperandsAndOutputs(mlir::Operation* op);
// Returns the operands of the LMHLO op `op` that act as inputs and outputs,
// respectively, of the corresponding HLO operation.
std::vector<mlir::Value> GetHloOperands(mlir::Operation* op);
std::vector<mlir::Value> GetHloOutputs(mlir::Operation* op);

// Returns true if `op` writes to the buffer `operand` (rather than only
// reading from it).
bool WritesMlirBuffer(mlir::Operation* op, mlir::Value operand);

template <typename T>
std::vector<T> ToStdVector(const llvm::SmallVectorImpl<T>& v) {
  return std::vector<T>(v.begin(), v.end());
}

// Returns the buffer allocation slice assigned to the given MLIR value.
StatusOr<BufferAllocation::Slice> GetAllocationSlice(
    mlir::Value v, absl::Span<const BufferAllocation> allocations,
    std::string* constant_name = nullptr);

bool CanEmitFusedDynamicUpdateSliceInPlaceForGpu(
    mlir::lmhlo::FusionOp fusion,
    absl::Span<const BufferAllocation> allocations);

// Returns the XLA shape of the given MLIR value.
Shape GetShape(mlir::Value value);

// Returns whether the given reduction can be safely generated without atomics:
// that is, at most one block will write to each output element.
bool ReductionIsRaceFree(const ReductionDimensions& reduction_dimensions,
                         const std::array<int64_t, 3>& reduction_tiling);
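
// Example (illustrative sketch): deciding whether a reduction needs atomics,
// assuming `reduce` passed IsReductionFromOrToContiguousDimensions and
// `compute_capability` describes the target GPU.
//
//   ReductionDimensions dims = GetReductionKindAndContiguousComponents(reduce);
//   std::array<int64, 3> tiling = GetReductionTiling(
//       dims, /*smallest_input_dtype_bits=*/32, compute_capability);
//   bool needs_atomics = !ReductionIsRaceFree(dims, tiling);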

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMISSION_UTILS_H_