/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_
#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_

#include <memory>

#include "mlir/IR/MLIRContext.h"  // from @llvm-project
#include "mlir/IR/PatternMatch.h"  // from @llvm-project
#include "mlir/Pass/Pass.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"

namespace mlir {

// Creates a pass that breaks up an island with multiple ops into multiple
// islands, each with a single op.
std::unique_ptr<OperationPass<ModuleOp>> CreateBreakUpIslandsPass();

// Creates a pass that converts MLIR functions consisting of MLIR ops into a
// tf_executor dialect as a single island.
std::unique_ptr<OperationPass<FuncOp>>
CreateFunctionalToExecutorDialectConversionPass();

// Creates a pass that lifts inner ops of tf_executor.island ops in
// tf_executor.graph into the same block as the tf_executor.graph.
std::unique_ptr<OperationPass<FuncOp>>
CreateExecutorDialectToFunctionalConversionPass();

namespace TF {
// Creates a pass that drops the `shape_invariant` attribute from
// While/WhileRegion ops.
std::unique_ptr<OperationPass<FuncOp>> CreateDropWhileShapeInvariantPass();

// Creates a pass that drops the `shape_invariant` attribute from
// While/WhileRegion ops within a device cluster.
std::unique_ptr<OperationPass<FuncOp>>
CreateDropWhileShapeInvariantInDeviceClusterPass();

// Creates a pass that moves writes to replicate invariant resource variables
// outside the tf_device.replicate op.
std::unique_ptr<OperationPass<FuncOp>>
CreateHoistReplicateInvariantResourceWritesPass();

// Transforms functional control flow operations in the TensorFlow dialect to
// MLIR Control Flow Graph (CFG) form.
std::unique_ptr<OperationPass<FuncOp>> CreateTFFunctionalControlFlowToCFG();

// Transforms functional control flow operations in the TensorFlow dialect to
// their region-based counterparts.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFFunctionalControlFlowToRegions();

// Transforms region-based control flow operations in the TensorFlow dialect to
// their functional counterparts.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFRegionControlFlowToFunctional();

// Materializes the MlirPassthroughOp by replacing it with the MLIR module
// attached as an attribute.
std::unique_ptr<OperationPass<FuncOp>> CreateMaterializePassthroughOpPass();

// Performs shape inference on the TensorFlow dialect using the global
// registry.
std::unique_ptr<OperationPass<ModuleOp>> CreateTFShapeInferencePass();
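
// Example: a minimal sketch (not part of this header) of scheduling a few of
// the passes above on a module. It assumes `module` is an already-parsed
// ModuleOp and `context` is its MLIRContext with the TF dialects registered.
//
//   mlir::PassManager pm(&context);
//   pm.addPass(mlir::TF::CreateTFShapeInferencePass());
//   pm.addNestedPass<mlir::FuncOp>(
//       mlir::CreateFunctionalToExecutorDialectConversionPass());
//   pm.addPass(mlir::CreateBreakUpIslandsPass());
//   if (mlir::failed(pm.run(module))) {
//     // Handle pass failure.
//   }
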
// Checks that the whole module does not contain dynamic shapes.
std::unique_ptr<OperationPass<FuncOp>> CreateTFEnsureStaticShapesPass();

// Guarantees that all FuncOps have a single use.
std::unique_ptr<OperationPass<ModuleOp>> CreateGuaranteeAllFuncsOneUsePass();

// Optional pass which will unroll BatchMatMul and use only MatMul.
std::unique_ptr<OperationPass<FuncOp>> CreateUnrollBatchMatMulPassPass();

// Optional pass which will map TF BatchMatMul to TF Einsum.
std::unique_ptr<OperationPass<FuncOp>> CreateBatchMatMulToEinsumPass();

// Optimizes the TensorFlow graph.
std::unique_ptr<OperationPass<FuncOp>> CreateTFOptimizePass();

// Creates a pass to rewrite RecvTPUEmbeddingActivationsOp and
// SendTPUEmbeddingGradients ops to internal variants.
std::unique_ptr<OperationPass<FuncOp>> CreateRewriteTPUEmbeddingOpsPass();

// Performs specific fusion for GPU targets.
std::unique_ptr<OperationPass<FuncOp>> CreateGpuOpFusionPass();

// Creates a pass that converts ops that copy tensors between devices, e.g.
// tf.Identity.
std::unique_ptr<OperationPass<mlir::FuncOp>>
CreateTensorDeviceCopyConversionPass();

// Returns a pass that folds tf.BroadcastTo nodes with subsequent nodes if they
// have built-in broadcasting support.
std::unique_ptr<OperationPass<FuncOp>> CreateBroadcastFoldPass();

struct LayoutOptimizationPipelineOptions
    : public PassPipelineOptions<LayoutOptimizationPipelineOptions> {
  Option<std::string> force_data_format{
      *this, "force-data-format",
      llvm::cl::desc("Force data format for all layout sensitive ops")};
  Option<bool> skip_fold_transpose_in_ops{
      *this, "skip-fold-transpose-in-ops",
      llvm::cl::desc("Skip folding transpose operands in Ops which can support "
                     "different layouts.")};
};

// Layout optimization assigns an optimal data layout for layout-sensitive
// operations and cancels all redundant transposes.
void CreateLayoutOptimizationPipeline(
    OpPassManager& pm,  // NOLINT - MLIR contract is pass by mutable reference.
    const LayoutOptimizationPipelineOptions& options);

struct StandardPipelineOptions
    : public PassPipelineOptions<StandardPipelineOptions> {
  Option<bool> enable_inliner{*this, "enable-inliner",
                              llvm::cl::desc("Enable inliner."),
                              llvm::cl::init(false)};
  Option<bool> form_clusters{*this, "form-clusters",
                             llvm::cl::desc("Enable Cluster Formation pass."),
                             llvm::cl::init(false)};
};

// Populates the pass manager with the passes involved in transforming or
// optimizing an MLIR graph without any target specialization.
// NOLINTNEXTLINE - MLIR contract is pass by mutable reference.
void CreateTFStandardPipeline(OpPassManager& pm,
                              const StandardPipelineOptions& options);

// Propagates device attributes of resources from callers to callees.
std::unique_ptr<OperationPass<ModuleOp>> CreateResourceDeviceInferencePass();

// Creates a pass that promotes resource reads/writes in the main function to
// inputs and outputs of the main function, assuming that resource operations
// have already been decomposed and function calls have already been inlined.
// The pass also annotates the input arguments for resources with the indices
// of their aliasing output arguments.
std::unique_ptr<OperationPass<ModuleOp>> CreatePromoteResourcesToArgsPass();
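
// Example: a hedged sketch of driving the pipelines declared above; it assumes
// `module` is a ModuleOp with the TF dialects registered, and the option
// values chosen here are illustrative only.
//
//   mlir::PassManager pm(module.getContext());
//   mlir::TF::StandardPipelineOptions options;
//   options.enable_inliner = true;
//   mlir::TF::CreateTFStandardPipeline(pm, options);
//
//   mlir::TF::LayoutOptimizationPipelineOptions layout_options;
//   layout_options.force_data_format = "NHWC";
//   mlir::TF::CreateLayoutOptimizationPipeline(pm.nest<mlir::FuncOp>(),
//                                              layout_options);
//   if (mlir::failed(pm.run(module))) {
//     // Handle pipeline failure.
//   }
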
// Creates a pass that promotes tf.VarHandleOp to resource arguments for all
// functions.
std::unique_ptr<OperationPass<ModuleOp>> CreatePromoteVarHandlesToArgsPass();

// Creates a pass that converts readonly reference variables to the
// corresponding resource variables.
std::unique_ptr<OperationPass<FuncOp>>
CreateConvertReadonlyReferenceVariablesToResourceVariablesPass();

// Creates a simple device assignment pass for the TF dialect for the CoreRT
// use case.
std::unique_ptr<OperationPass<FuncOp>> CreateSimpleTFDeviceAssignmentPass(
    llvm::StringRef default_device);

// Performs resource lifting on the function body to hoist resource variable
// accesses outside all control flow statements.
LogicalResult ResourceLiftingForFunctionalControlFlow(FuncOp function);

// Converts stack ops into operations on local variables, which can later be
// removed by resource lifting. Requires known maximum sizes of stacks and
// known element shapes of push ops.
std::unique_ptr<OperationPass<ModuleOp>> CreateStackOpsDecompositionPass();

// Converts tensor list operations into operations on buffers and sizes. Needs
// static shapes and a known max element count.
std::unique_ptr<OperationPass<ModuleOp>> CreateTensorListOpsDecompositionPass();

// Converts tensor array ops into operations on local variables, which can
// later be removed by resource lifting. Requires known sizes and known element
// shapes (either defined in TensorArrayV3 or implied in the first write).
std::unique_ptr<OperationPass<ModuleOp>>
CreateTensorArrayOpsDecompositionPass();

// Creates a pass that legalizes HLO to the TF dialect.
std::unique_ptr<OperationPass<FuncOp>> CreateLegalizeHloToTfPass();

// Adds the HLO to TF rewrite patterns to the specified pattern list.
void PopulateLegalizeHloToTfPatterns(OwningRewritePatternList* patterns,
                                     MLIRContext* context);

// Matches sequences of ops to TensorFlow fused kernels. This pass should not
// be generally used beyond exporting to runtimes that support these ops. In
// the future these fusions may be codegen'd automatically.
std::unique_ptr<OperationPass<FuncOp>> CreateFusedKernelMatcherPass();

// Creates a function pass to select the device index / fold tf.DeviceIndex.
std::unique_ptr<OperationPass<FuncOp>> CreateDeviceIndexSelectorPass();

// Creates a function pass to replace InitializeTableFromTextFileV2 ops with
// LookupTableImportV2 ops.
std::unique_ptr<OperationPass<FuncOp>> CreateInitTextFileToImportPass(
    std::string saved_model_dir = "");

// Creates a function pass to cluster TensorFlow ops by host. The program
// generated by this pass will have one function per host, where all operations
// in the same function are placed on the same host. Each result of the
// per-host function will have a "tf.device" attribute which specifies the
// device assignment of the result.
std::unique_ptr<OperationPass<mlir::ModuleOp>> CreateClusterTFOpsByHostPass();

// Creates a pass to insert tf_device.send and tf_device.receive ops to make
// sure any argument of any op is on the same host as the op itself.
std::unique_ptr<OperationPass<mlir::ModuleOp>> CreateCrossHostTransferPass();
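
// Example: a minimal sketch of applying the function-level utility above
// directly, outside a pass pipeline; it assumes `module` is a loaded ModuleOp
// whose resource ops have already been decomposed.
//
//   for (mlir::FuncOp func : module.getOps<mlir::FuncOp>()) {
//     if (mlir::failed(
//             mlir::TF::ResourceLiftingForFunctionalControlFlow(func))) {
//       // Report the failing function and abort the transformation.
//     }
//   }
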
// Creates a pass that adds the device attribute to every tf.Const op based on
// the device attribute of the operations that read its result. If the result
// of a tf.Const op is read by operations placed on multiple devices, then the
// pass will replicate the tf.Const op once for each device.
std::unique_ptr<OperationPass<ModuleOp>> CreateConstantOpDeviceAssignmentPass();

// Populates the supplied pass manager with the passes required to export to
// TensorFlow Graph.
void AddGraphExportLoweringPasses(OpPassManager& pm);

// Returns a pass that verifies whether each function in the module consists of
// a single tf_executor.graph and each tf_executor.island in the
// tf_executor.graph only has a single op.
std::unique_ptr<OperationPass<ModuleOp>> CreateVerifySuitableForExportPass();

// Returns a pass that prepares TPU computation to be legal for export to
// TensorFlow.
std::unique_ptr<OperationPass<FuncOp>>
CreatePrepareTpuComputationForTfExportPass();

// Rewrites ops that require quantized inputs or outputs to ops that allow
// non-quantized inputs and outputs.
std::unique_ptr<OperationPass<FuncOp>> CreateLowerQuantizedPass();
}  // namespace TF

namespace tf_executor {
// Creates a pass to merge IslandOps from the TFExecutor dialect.
std::unique_ptr<OperationPass<FuncOp>> CreateTFExecutorIslandCoarseningPass();

// Creates a pass to merge IslandOps for operations marked for execution on
// TPU. This is for V1 backward compatibility.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFExecutorTPUV1IslandCoarseningPass();

// Creates a pass to outline TPU clusters from a single IslandOp into a nested
// module suitable for being processed as if it were a V2 module.
// This is for V1 backward compatibility.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFExecutorTPUV1IslandOutliningPass();

// Creates a pass to inline calls to the nested TPU module; this reverses the
// effect of the `TFExecutorTPUV1IslandOutlining` pass above.
// This is for V1 backward compatibility.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTFExecutorTPUV1IslandInliningPass();

// Creates a pass to prune dead nodes from tf_executor.graph.
std::unique_ptr<OperationPass<FuncOp>> CreateTFExecutorGraphPruningPass(
    llvm::ArrayRef<std::string> ops_to_preserve = {});
}  // namespace tf_executor

namespace TFDevice {
// Creates a pass that forms clusters from instructions that are assigned to
// the same device.
std::unique_ptr<OperationPass<FuncOp>> CreateClusterFormationPass();

// Sinks `tf.Const` operations into the ClusterOp region using them. This is
// performed in order to limit the number of values implicitly captured in this
// region before outlining.
std::unique_ptr<OperationPass<FuncOp>> CreateClusterConstantSinkingPass(
    llvm::function_ref<bool(tf_device::ClusterOp, ElementsAttr)> filter = {});

// Creates a pass that outlines regions of tf_device.cluster operations.
std::unique_ptr<OperationPass<ModuleOp>> CreateClusterOutliningPass();

// Creates a pass that outlines regions of tf_device.launch operations.
std::unique_ptr<OperationPass<ModuleOp>> CreateLaunchOutliningPass();
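
// Example: a minimal sketch of a device-clustering sub-pipeline using the
// passes above; it assumes `pm` is an existing mlir::PassManager. The
// small-constant filter is illustrative only, and the callable must outlive
// the pass manager run since the pass takes it by llvm::function_ref.
//
//   auto sink_small_constants = [](mlir::tf_device::ClusterOp cluster,
//                                  mlir::ElementsAttr value) {
//     // Only sink small constants into the cluster.
//     return value.getType().getNumElements() <= 16;
//   };
//   pm.addNestedPass<mlir::FuncOp>(
//       mlir::TFDevice::CreateClusterFormationPass());
//   pm.addNestedPass<mlir::FuncOp>(
//       mlir::TFDevice::CreateClusterConstantSinkingPass(sink_small_constants));
//   pm.addPass(mlir::TFDevice::CreateClusterOutliningPass());
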
// Creates a pass that converts tf_device::LaunchFuncOp into
// TF::PartitionedCallOp.
std::unique_ptr<OperationPass<ModuleOp>> CreateConvertLaunchFuncToTFCallPass();

// A pass that decomposes composite resource operations into primitive ones
// like ReadVariableOp, AssignVariableOp and other computations to facilitate
// transformations like resource op lifting.
std::unique_ptr<OperationPass<FuncOp>> CreateDecomposeResourceOpsPass();

// A pass that decomposes composite resource operations in a device cluster
// (tf_device.cluster op) into primitive ones like ReadVariableOp,
// AssignVariableOp and other computations to facilitate transformations like
// resource op lifting.
std::unique_ptr<OperationPass<ModuleOp>>
CreateDecomposeResourceOpsInClusterPass();

// Creates a pass that marks TPU cluster input-output pairs reading and writing
// to the same resource variable as aliases.
std::unique_ptr<OperationPass<ModuleOp>> CreateMarkInputOutputAliasesPass();

// Creates a pass that lifts operations on external resource variables out of
// device computation nested in `tf_device::LaunchOp` so that resource variable
// load operations all occur before device computation, while resource variable
// store operations all occur after device computation. After this pass, device
// computation no longer interacts with external resource variables.
std::unique_ptr<OperationPass<ModuleOp>> CreateResourceOpLiftingPass();

// Creates a pass that lifts operations from the main function.
std::unique_ptr<OperationPass<ModuleOp>>
CreateResourceOpLiftingForMainFunctionPass();

// Lifts resource operations from tf_device.launch_func ops nested in `op`
// outside. Returns a failure if there are remaining resource-type values that
// cannot be lifted.
LogicalResult LiftResourceOps(Operation* op);

// Creates a pass that hoists invariant operations in a `tf_device.replicate`.
std::unique_ptr<OperationPass<FuncOp>> CreateReplicateInvariantOpHoistingPass();

// Creates a pass that forms replica `tf_executor.island` ops from a single
// `tf_device.replicate` island.
std::unique_ptr<OperationPass<FuncOp>> CreateReplicateToIslandPass();

// Creates a pass that creates `tf_executor.island` ops from a single
// `tf_device.parallel_execute` island.
std::unique_ptr<OperationPass<FuncOp>> CreateParallelExecuteToIslandsPass();

// Creates a pass that annotates whether a LaunchFuncOp's parameters have the
// same data across replicas.
std::unique_ptr<OperationPass<ModuleOp>>
CreateAnnotateParameterReplicationPass();

// Creates a pass that marks unsupported ops in a device cluster for outside
// compilation.
std::unique_ptr<OperationPass<ModuleOp>>
CreateMarkOpsForOutsideCompilationPass();

// Creates a pass that merges control flow with similar predicates.
std::unique_ptr<OperationPass<ModuleOp>> CreateMergeControlFlowPass();

// Creates a pass that wraps each TensorFlow dialect op with a `device`
// attribute in a `tf_device.launch` op with the same `device` attribute.
std::unique_ptr<OperationPass<FuncOp>> CreateDeviceAttributeToLaunchPass();
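
// Example: a hedged sketch of one plausible ordering of the resource passes
// above, mirroring the comments' intent that decomposition facilitates
// lifting; it assumes `pm` is an existing mlir::PassManager and is not a
// prescribed pipeline.
//
//   pm.addPass(mlir::TFDevice::CreateDecomposeResourceOpsInClusterPass());
//   pm.addPass(mlir::TFDevice::CreateResourceOpLiftingPass());
//   pm.addNestedPass<mlir::FuncOp>(
//       mlir::TFDevice::CreateReplicateInvariantOpHoistingPass());
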
// Creates a pass that hoists a `tf_device.launch` body and assigns a `device`
// attribute to each TensorFlow dialect op in the body based on the `device`
// attribute on the `tf_device.launch`.
std::unique_ptr<OperationPass<FuncOp>> CreateLaunchToDeviceAttributePass();

// Creates a pass that hoists a `tf_device.replicate` body and replicates each
// TensorFlow dialect op in the body based on its `device` attribute and the
// `devices` attribute on the `tf_device.replicate`.
std::unique_ptr<OperationPass<mlir::ModuleOp>> CreateTFDeviceReplicationPass();

// Creates a pass that extracts ops in a tf_device.launch op with host device
// assignment and adds an `_xla_outside_compilation` attribute value.
std::unique_ptr<OperationPass<ModuleOp>>
CreateHostLaunchToOutsideCompiledPass();

}  // namespace TFDevice

namespace TFTPU {
// Creates a pass that forms clusters from operations with the same
// `_tpu_replicate` attribute.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUClusterFormationPass();

// Creates a pass that cleans up the `_tpu_replicate` attribute on operations
// that are inside a cluster.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTPUClusterCleanupAttributesPass();

// Creates a pass that removes Identity/IdentityN ops from a cluster.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUIdentityPruningPass();

// Creates a pass that allows TPU program inputs to have layouts determined at
// run time.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUDynamicLayoutPass();

// Creates a pass that adds `tf.ReadVariableOp` to a TPU cluster for resources
// the cluster only writes to.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUResourceReadForWritePass();

// Creates a pass that reorders partitioned resource reads and replicated
// inputs.
std::unique_ptr<OperationPass<FuncOp>>
CreateTPUReorderReplicateAndPartitionedInputsPass();

// Creates a pass that partitions unpartitioned resource reads/writes to
// partitioned resource variables.
std::unique_ptr<OperationPass<FuncOp>>
CreateTPUResourceReadsWritesPartitioningPass();

// Creates a pass that rewrites `tf_device.launch_func` on TPUs into TPU
// runtime ops.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPURewritePass();

// Creates a pass that identifies XlaSharding ops in a launch op for TPU
// computation.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUShardingIdentificationPass();

// Creates a pass that moves `tf.AssignVariableOp` into a
// `tf_device.parallel_execute` region if the `tf.AssignVariableOp` is the
// only consumer of a `tf_device.parallel_execute` result.
std::unique_ptr<OperationPass<FuncOp>>
CreateTPUParallelExecuteSinkResourceWritePass();

// Creates a pass that merges device variable reads/updates into the
// surrounding TPUExecute node. This allows the execute node to perform
// in-place variable updates.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTPUMergeVariablesWithExecutePass();

// Creates a pass that wraps ReadVariableOp/AssignVariableOp ops that consume a
// packed tensor so they have the same device placement as the underlying TPU
// device.
std::unique_ptr<OperationPass<FuncOp>> CreateTPUColocateCompositeResourceOps();
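
// Example: a minimal sketch of hand-picking a few of the TPU passes above; it
// assumes `pm` is an existing mlir::PassManager. A production flow would
// normally use the TPU bridge pipelines declared further below instead of
// assembling passes manually.
//
//   pm.addPass(mlir::TFTPU::CreateTPUClusterFormationPass());
//   pm.addPass(mlir::TFTPU::CreateTPUClusterCleanupAttributesPass());
//   pm.addPass(mlir::TFTPU::CreateTPURewritePass());
//   pm.addPass(mlir::TFTPU::CreateTPUMergeVariablesWithExecutePass());
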
// Creates a pass that adds ops which perform formatting on variables at
// run time according to the compilation result.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUVariableReformattingPass();

// Creates a pass that wraps ops with the same `_xla_outside_compilation`
// attribute value in a tf_device.launch op with host device assignment.
std::unique_ptr<OperationPass<ModuleOp>>
CreateOutsideCompiledToHostLaunchPass();

// Creates a pass that extracts outside compilation (CPU ops inside a TPU
// cluster) at the head/tail of the TPU cluster to run before/after TPU
// computation.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTPUExtractHeadTailOutsideCompilationPass();

// Creates a pass that expands the outside compilation cluster at the head/tail
// of TPU computation by adding the outside compilation attribute to
// identity/cast ops that are only used for host computation.
std::unique_ptr<OperationPass<FuncOp>> CreateTPUHostComputationExpansionPass();

// Creates a pass that updates inputs to TPU embedding layer enqueue ops so
// that correct ops are invoked during training and evaluation.
std::unique_ptr<OperationPass<FuncOp>>
CreateTPUUpdateEmbeddingEnqueueOpInputsPass();

// Creates a pass that extracts outside compilation (CPU ops inside a TPU
// cluster) ops to a separate parallel_execute region to run on CPU.
std::unique_ptr<OperationPass<ModuleOp>>
CreateTPUExtractOutsideCompilationPass();

// Creates a pass that propagates TPU devices to users.
std::unique_ptr<OperationPass<FuncOp>> CreateTPUDevicePropagationPass();

// Populates the supplied pass manager with the passes required to run the
// bridge.
void CreateTPUBridgePipeline(OpPassManager& pm);

// Populates the supplied pass manager with the passes required to run the
// bridge in V1 mode.
void CreateTPUBridgePipelineV1(OpPassManager& pm);

// Creates a pass that replicates the tf._TPUCompileMlir op on each host that
// needs the compiled program. It helps avoid transferring the compiled binary
// between hosts.
std::unique_ptr<OperationPass<mlir::ModuleOp>>
CreateTPUCompileOpReplicationPass();

// Creates a pass that applies the space-to-depth transform to the first or
// frontier convolutions that consume host inputs on TPU.
std::unique_ptr<OperationPass<ModuleOp>> CreateTPUSpaceToDepthPass();

}  // namespace TFTPU

#define GEN_PASS_REGISTRATION
#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc"

namespace TFDevice {
#define GEN_PASS_REGISTRATION
#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.h.inc"
}  // namespace TFDevice

}  // namespace mlir

#endif  // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_