1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_ 17 #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_ 18 19 #include <memory> 20 21 #include "mlir/IR/MLIRContext.h" // from @llvm-project 22 #include "mlir/IR/PatternMatch.h" // from @llvm-project 23 #include "mlir/Pass/Pass.h" // from @llvm-project 24 25 namespace mlir { 26 27 // Creates a pass that breaks up an island with multiple ops into multiple 28 // islands, each with a single op. 29 std::unique_ptr<OperationPass<ModuleOp>> CreateBreakUpIslandsPass(); 30 31 // Creates a pass that converts mlir functions consisting of mlir ops into a 32 // tf_executor dialect as a single island. 33 std::unique_ptr<OperationPass<FuncOp>> 34 CreateFunctionalToExecutorDialectConversionPass(); 35 36 // Creates a pass that lifts inner ops of tf_executor.island ops in 37 // tf_executor.graph into the same block as the tf_executor.graph. 38 std::unique_ptr<OperationPass<FuncOp>> 39 CreateExecutorDialectToFunctionalConversionPass(); 40 41 namespace TF { 42 // Creates a pass that drops `shape_invariant` attribute from While/WhileRegion 43 // ops. 
std::unique_ptr<OperationPass<FuncOp>> CreateDropWhileShapeInvariantPass();

// Transforms functional control flow operations in the TensorFlow dialect to
// MLIR Control Flow Graph (CFG) form.
std::unique_ptr<OperationPass<FuncOp>> CreateTFFunctionalControlFlowToCFG();

// Transforms functional control flow operations in the TensorFlow dialect to
// their region-based counterparts.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFFunctionalControlFlowToRegions();

// Transforms region-based control flow operations in the TensorFlow dialect to
// their functional counterparts.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFRegionControlFlowToFunctional();

// Materializes the MlirPassthroughOp by replacing it with the MLIR module
// attached as an attribute.
std::unique_ptr<OperationPass<FuncOp>> CreateMaterializePassthroughOpPass();

// Performs Shape Inference on the TensorFlow dialect using the global registry.
std::unique_ptr<OperationPass<ModuleOp>> CreateTFShapeInferencePass();

// Guarantees that all FuncOps have a single use.
std::unique_ptr<OperationPass<ModuleOp>> CreateGuaranteeAllFuncsOneUsePass();

// Optional pass which will unroll BatchMatMul and use only MatMul.
// NOTE(review): the identifier ends in `PassPass`; this looks unintentional
// but it is part of the public interface, so it is left unchanged.
std::unique_ptr<OperationPass<FuncOp>> CreateUnrollBatchMatMulPassPass();

// Optional pass which will map TF BatchMatMul to TF Einsum.
std::unique_ptr<OperationPass<FuncOp>> CreateBatchMatMulToEinsumPass();

// Optimizes the TensorFlow graph.
std::unique_ptr<OperationPass<FuncOp>> CreateTFOptimizePass();

// Creates a pass to rewrite RecvTPUEmbeddingActivationsOp and
// SendTPUEmbeddingGradients ops to internal variants.
std::unique_ptr<OperationPass<FuncOp>> CreateRewriteTPUEmbeddingOpsPass();

// Performs specific fusion for GPU targets.
std::unique_ptr<OperationPass<FuncOp>> CreateGpuOpFusionPass();

// Creates a pass that converts ops that copy tensors between devices, e.g.
// tf.Identity.
std::unique_ptr<OperationPass<mlir::FuncOp>>
CreateTensorDeviceCopyConversionPass();

// Returns a pass that folds tf.BroadcastTo nodes with subsequent nodes if they
// have built-in broadcasting support.
std::unique_ptr<OperationPass<FuncOp>> CreateBroadcastFoldPass();

// Options accepted by `CreateLayoutOptimizationPipeline`.
struct LayoutOptimizationPipelineOptions
    : public PassPipelineOptions<LayoutOptimizationPipelineOptions> {
  // `force-data-format`: data format to force on all layout sensitive ops.
  // No explicit initial value is given here.
  Option<std::string> force_data_format{
      *this, "force-data-format",
      llvm::cl::desc("Force data format for all layout sensitive ops")};
  // `skip-fold-transpose-in-ops`: skips folding transpose operands into ops
  // which can support different layouts. No explicit initial value is given
  // here.
  Option<bool> skip_fold_transpose_in_ops{
      *this, "skip-fold-transpose-in-ops",
      llvm::cl::desc("Skip folding transpose operands in Ops which can support "
                     "different layouts.")};
};

// Layout optimization assigns optimal data layout for layout sensitive
// operations, and cancels all redundant transposes.
void CreateLayoutOptimizationPipeline(
    OpPassManager& pm,  // NOLINT - MLIR contract is pass by mutable reference.
    const LayoutOptimizationPipelineOptions& options);

// Options accepted by `CreateTFStandardPipeline`.
struct StandardPipelineOptions
    : public PassPipelineOptions<StandardPipelineOptions> {
  // `enable-inliner`: enables the inliner; initialized to false.
  Option<bool> enable_inliner{*this, "enable-inliner",
                              llvm::cl::desc("Enable inliner."),
                              llvm::cl::init(false)};
  // `form-clusters`: enables the Cluster Formation pass; initialized to false.
  Option<bool> form_clusters{*this, "form-clusters",
                             llvm::cl::desc("Enable Cluster Formation pass."),
                             llvm::cl::init(false)};
};

// Populates the pass manager with the passes involved in transforming or
// optimizing an MLIR graph without any target specialization.
// NOLINTNEXTLINE - MLIR contract is pass by mutable reference.
void CreateTFStandardPipeline(OpPassManager& pm,
                              const StandardPipelineOptions& options);

// Propagates device attributes of resources from callers to callees.
std::unique_ptr<OperationPass<ModuleOp>> CreateResourceDeviceInferencePass();

// Creates a pass that promotes resource reads/writes in the main function to
// inputs and outputs of the main function, assuming that resource operations
// have already been decomposed and function calls have already been inlined.
// The pass also annotates the input arguments for resources with the indices
// of their aliasing output arguments.
std::unique_ptr<OperationPass<ModuleOp>> CreatePromoteResourcesToArgsPass();

// Creates a pass that promotes tf.VarHandleOp to resource arguments for all
// functions.
std::unique_ptr<OperationPass<ModuleOp>> CreatePromoteVarHandlesToArgsPass();

// Creates a pass that converts readonly reference variables to the
// corresponding resource variables.
std::unique_ptr<OperationPass<FuncOp>>
CreateConvertReadonlyReferenceVariablesToResourceVariablesPass();

// Creates a simple device assignment pass on the TF dialect for the CoreRT use
// case.
// NOTE(review): `default_device` is presumably the device given to ops that
// have none assigned -- confirm against the pass implementation.
std::unique_ptr<OperationPass<FuncOp>> CreateSimpleTFDeviceAssignmentPass(
    llvm::StringRef default_device);

// Performs resource lifting on the function body to hoist resource variable
// accesses outside all control flow statements. Unlike the factories around
// it, this is a direct utility: it takes the function to process and reports
// the outcome as a LogicalResult.
LogicalResult ResourceLiftingForFunctionalControlFlow(FuncOp function);

// Converts stack ops into operations on local variables, which can later be
// removed by resource lifting. Requires known maximum sizes of stacks and
// known element shapes of push ops.
std::unique_ptr<OperationPass<ModuleOp>> CreateStackOpsDecompositionPass();

// Converts tensor list operations into operations on buffers and sizes. Needs
// static shapes and known max element count.
std::unique_ptr<OperationPass<ModuleOp>> CreateTensorListOpsDecompositionPass();

// Converts tensor array ops into operations on local variables, which can later
// be removed by resource lifting. Requires known sizes and known element shapes
// (either defined in TensorArrayV3 or implied in the first write).
std::unique_ptr<OperationPass<ModuleOp>>
CreateTensorArrayOpsDecompositionPass();

// Creates a pass that legalizes HLO to the TF dialect.
std::unique_ptr<OperationPass<FuncOp>> CreateLegalizeHloToTfPass();

// Adds the HLO to TF rewrite patterns to the specified pattern list.
void PopulateLegalizeHloToTfPatterns(OwningRewritePatternList* patterns,
                                     MLIRContext* context);

// Matches sequences of ops to TensorFlow fused kernels. This pass should not be
// generally used beyond exporting to runtimes that support these ops. In the
// future these fusions may be codegen'd automatically.
std::unique_ptr<OperationPass<FuncOp>> CreateFusedKernelMatcherPass();

// Fuses operations defining the `ContractionFusableInterface` interface into
// the contraction operations (MatMul, Conv2D, etc...). This is a more general
// version of `CreateFusedKernelMatcherPass` that relies on codegen to compose
// contraction fusions together.
std::unique_ptr<OperationPass<FuncOp>> CreateContractionFusionPass();

// Creates a function pass to select device index/fold tf.DeviceIndex.
std::unique_ptr<OperationPass<FuncOp>> CreateDeviceIndexSelectorPass();

// Creates a function pass to replace InitializeTableFromTextFileV2Ops with
// LookupTableImportV2Op ops.
std::unique_ptr<OperationPass<FuncOp>> CreateInitTextFileToImportPass();

// Creates function pass to cluster TensorFlow ops by host. The program
// generated by this pass will have one function per host where all operations
// in the same function are placed on the same host.
// Each result of the per-host function will have a "tf.device" attribute which
// specifies the device assignment of the result.
std::unique_ptr<OperationPass<mlir::ModuleOp>> CreateClusterTFOpsByHostPass();

// Creates a pass to insert tf_device.send and tf_device.receive ops to make
// sure any argument of any op is on the same host as the op itself.
std::unique_ptr<OperationPass<mlir::ModuleOp>> CreateCrossHostTransferPass();

// Creates a pass that adds the device attribute to every tf.Const op based on
// the device attribute of the operations that read its result. If the result of
// a tf.Const op is read by operations placed on multiple devices, then the pass
// will replicate the tf.Const op once for each device.
std::unique_ptr<OperationPass<ModuleOp>> CreateConstantOpDeviceAssignmentPass();

}  // namespace TF

namespace tf_executor {
// Creates a pass to merge IslandOps from TFExecutor dialect.
std::unique_ptr<OperationPass<FuncOp>> CreateTFExecutorIslandCoarseningPass();

// Creates a pass to merge IslandOps for operations marked for execution on TPU.
// This exists for V1 backward compatibility.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFExecutorTPUV1IslandCoarseningPass();

// Creates a pass to outline TPU clusters from a single IslandOp into a nested
// module suitable for being processed as if it were a V2 module.
// This exists for V1 backward compatibility.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFExecutorTPUV1IslandOutliningPass();

// Creates a pass to inline calls to the nested TPU module; this reverses the
// effect of the `TFExecutorTPUV1IslandOutlining` pass above.
// This exists for V1 backward compatibility.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFExecutorTPUV1IslandInliningPass();

// Creates a pass to prune tf_executor.graph from dead nodes.
std::unique_ptr<OperationPass<FuncOp>> CreateTFExecutorGraphPruningPass(
    llvm::ArrayRef<std::string> ops_to_preserve = {});
}  // namespace tf_executor

namespace TFDevice {
// Creates a pass that forms clusters from instructions that are assigned to
// the same device.
std::unique_ptr<OperationPass<FuncOp>> CreateClusterFormationPass();

// Sinks `tf.Const` operations in the ClusterOp region using them. This is
// performed in order to limit the number of values implicitly captured in this
// region before outlining.
std::unique_ptr<OperationPass<FuncOp>> CreateClusterConstantSinkingPass();

// Creates a pass that outlines regions of tf_device.launch operations.
std::unique_ptr<OperationPass<ModuleOp>> CreateClusterOutliningPass();

// Creates a pass that clusters ops into tf_device::ClusterOp regions
// according to a policy specified by the pass options.
// NOTE(review): unlike the sibling factories, this returns `FunctionPass`
// rather than `OperationPass<FuncOp>` -- confirm this is intentional.
std::unique_ptr<FunctionPass> CreateClusterOpsByPolicyPass();

// A pass that decomposes composite resource operations into primitive ones like
// ReadVariableOp, AssignVariableOp and other computations to facilitate
// transformations like resource op lifting.
std::unique_ptr<OperationPass<FuncOp>> CreateDecomposeResourceOpsPass();

// Creates a pass that lifts operations on external resource variables from
// device computation nested in `tf_device::LaunchOp` out so that resource
// variable load operations are all before device computation while resource
// variable store operations are all after device computation. After this pass,
// device computation no longer interacts with external resource variables.
std::unique_ptr<OperationPass<ModuleOp>> CreateResourceOpLiftingPass();

// Lifts resource operations from tf_device.launch_func ops nested in `op`
// outside. Returns a failure if there are remaining resource-type values that
// cannot be lifted.
LogicalResult LiftResourceOps(Operation* op);

// Creates a pass that hoists invariant operations in a `tf_device.replicate`.
std::unique_ptr<OperationPass<FuncOp>> CreateReplicateInvariantOpHoistingPass();

// Creates a pass that forms replica `tf_executor.island` from a single
// `tf_device.replicate` island.
std::unique_ptr<OperationPass<FuncOp>> CreateReplicateToIslandPass();

// Creates a pass that creates `tf_executor.island` from a single
// `tf_device.parallel_execute` island.
std::unique_ptr<OperationPass<FuncOp>> CreateParallelExecuteToIslandsPass();

// Creates a pass that annotates whether a LaunchFuncOp's parameters have the
// same data across replicas.
std::unique_ptr<OperationPass<ModuleOp>>
CreateAnnotateParameterReplicationPass();

// Creates a pass that marks unsupported ops in a device cluster for outside
// compilation.
std::unique_ptr<OperationPass<ModuleOp>>
CreateMarkOpsForOutsideCompilationPass();

// Creates a pass that merges control flow with similar predicates.
std::unique_ptr<OperationPass<ModuleOp>> CreateMergeControlFlowPass();

// Creates a pass that hoists a `tf_device.launch` body and assigns a `device`
// attribute to each TensorFlow dialect op in the body based on the `device`
// attribute on the `tf_device.launch`.
std::unique_ptr<OperationPass<FuncOp>> CreateLaunchToDeviceAttributePass();

// Creates a pass that hoists a `tf_device.replicate` body and replicates each
// TensorFlow dialect op in the body based on its `device` attribute and the
// `devices` attribute on the `tf_device.replicate`.
std::unique_ptr<OperationPass<mlir::ModuleOp>> CreateTFDeviceReplicationPass();
}  // namespace TFDevice

namespace TFTPU {
// Creates a pass that forms clusters from operations of the same
// `_tpu_replicate` attribute.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUClusterFormationPass();

// Creates a pass that cleans up `_tpu_replicate` attribute on operations
// that are inside a cluster.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTPUClusterCleanupAttributesPass();

// Creates a pass that removes Identity/IdentityN ops from a cluster.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUIdentityPruningPass();

// Creates a pass that allows TPU program inputs to have layouts determined at
// run time.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUDynamicLayoutPass();

// Creates a pass that remaps and assigns the padding map from a
// `tf_device.launch_func` `padding_map` attribute to its encapsulated function.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUDynamicPaddingMapperPass();

// Creates a pass that adds `tf.ReadVariableOp` to a TPU cluster for resources
// the cluster only writes to.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUResourceReadForWritePass();

// Creates a pass that reorders partitioned resource reads and replicated
// inputs.
std::unique_ptr<OperationPass<FuncOp>>
CreateTPUReorderReplicateAndPartitionedInputsPass();

// Creates a pass that partitions unpartitioned resource read/write to
// partitioned resource variables.
std::unique_ptr<OperationPass<FuncOp>>
CreateTPUResourceReadsWritesPartitioningPass();

// Creates a pass that rewrites `tf_device.launch_func` on TPUs into TPU runtime
// ops.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPURewritePass();

// Creates a pass that identifies XLASharding ops in launch op for TPU
// computation.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUShardingIdentificationPass();

// Creates a pass that moves `tf.AssignVariableOp` into a
// `tf_device.parallel_execute` region if the `tf.AssignVariableOp` is the
// only consumer of a `tf_device.parallel_execute` result.
std::unique_ptr<OperationPass<FuncOp>>
CreateTPUParallelExecuteSinkResourceWritePass();

// Creates a pass that merges device variable reads/updates into the surrounded
// TPUExecute node. This allows the execute node to perform in-place variable
// updates.
std::unique_ptr<OperationPass<FuncOp>> CreateTPUMergeVariablesWithExecutePass();

// Creates a pass that wraps ReadVariableOp/AssignVariableOp ops that consume a
// packed tensor to have the same device placement as the underlying TPU device.
std::unique_ptr<OperationPass<FuncOp>> CreateTPUColocateCompositeResourceOps();

// Creates a pass that adds ops which perform formatting on variables at
// run-time according to the compilation result.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUVariableReformattingPass();

// Creates a pass that wraps ops with the same `_xla_outside_compilation`
// attribute value in a tf_device.launch op with host device assignment.
std::unique_ptr<OperationPass<ModuleOp>>
CreateOutsideCompiledToHostLaunchPass();

// Creates a pass that extracts outside compilation (CPU ops inside TPU cluster)
// at head/tail of TPU cluster to run before/after TPU computation.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTPUExtractHeadTailOutsideCompilationPass();

// Creates a pass that expands outside compilation cluster at the head/tail of
// TPU computation by adding outside compilation attribute to identity/cast ops
// that are only used for host computation.
std::unique_ptr<OperationPass<FuncOp>> CreateTPUHostComputationExpansionPass();

// Creates a pass that updates inputs to TPU embedding layer enqueue ops so that
// correct ops are invoked during training and evaluation.
std::unique_ptr<OperationPass<FuncOp>>
CreateTPUUpdateEmbeddingEnqueueOpInputsPass();

// Creates a pass that extracts outside compilation (CPU ops inside TPU cluster)
// ops to a separate parallel_execute region to run on CPU.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTPUExtractOutsideCompilationPass();

// Creates a pass that propagates TPU devices to users.
std::unique_ptr<OperationPass<FuncOp>> CreateTPUDevicePropagationPass();

// Populates the supplied pass manager with the passes required to run the
// bridge.
void CreateTPUBridgePipeline(OpPassManager& pm);

// Populates the supplied pass manager with the passes required to run the
// bridge in V1 mode.
void CreateTPUBridgePipelineV1(OpPassManager& pm);

// Creates a pass that replicates the tf._TPUCompileMlir op on each host that
// needs the compiled program. It helps avoid transferring the compiled binary
// between hosts.
std::unique_ptr<OperationPass<mlir::ModuleOp>>
CreateTPUCompileOpReplicationPass();

}  // namespace TFTPU

// Defined immediately before including the generated header; presumably this
// selects the pass registration section of tf_passes.h.inc -- confirm against
// the generated file.
#define GEN_PASS_REGISTRATION
#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc"

}  // namespace mlir

#endif  // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_