/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PARALLEL_LOOP_EMITTER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PARALLEL_LOOP_EMITTER_H_

#include <vector>

#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "llvm/IR/IRBuilder.h"
#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"

namespace xla {
namespace gpu {

// Emits a parallel loop for every element in the given array shape. The
// emitted loop is executed by multiple threads in parallel, so each thread
// instance of the loop iterates over part of the array, and together the
// threads cover the entire array.
class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
 public:
  // `launch_dimensions` specifies the block and thread dimensions to
  // parallelize the loop on. The meanings of the other parameters are the
  // same as for LoopEmitter.
  ParallelLoopEmitter(BodyEmitter body_emitter, const Shape& shape,
                      const LaunchDimensions& launch_dimensions,
                      llvm::IRBuilder<>* b, int unroll_factor = 1);

  // Constructs a ParallelLoopEmitter from an element generator that generates
  // each element of the given target array.
  ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
                      const llvm_ir::IrArray& target_array,
                      const LaunchDimensions& launch_dimensions,
                      llvm::IRBuilder<>* b, int unroll_factor = 1);

  // Constructs a loop emitter for a loop that generates one element of each
  // of N arrays on each iteration.
  //
  // This is used in multi-output fusion. target_element_generator should
  // produce a struct with N elements, one for each of target_arrays.
  ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
                      absl::Span<const llvm_ir::IrArray> target_arrays,
                      const LaunchDimensions& launch_dimensions,
                      llvm::IRBuilder<>* b, int unroll_factor = 1);

  ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
  ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
  ~ParallelLoopEmitter() override = default;

  std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
      absl::string_view loop_name, llvm::Type* index_type) override;

 private:
  // The thread and block dimensions to parallelize the loop on.
  const LaunchDimensions launch_dimensions_;
  const int unroll_factor_;
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PARALLEL_LOOP_EMITTER_H_
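
// A minimal usage sketch (illustrative only, not part of this header). It
// assumes CalculateLaunchDimensions from partition_assignment.h, llvm_ir::IrName,
// and the inherited LoopEmitter::EmitLoop; `element_generator`, `ir_array`,
// `device_description`, `hlo`, and `b` stand in for values owned by the
// calling IR emitter:
//
//   xla::gpu::LaunchDimensions launch_dimensions =
//       xla::gpu::CalculateLaunchDimensions(hlo->shape(), device_description);
//   xla::gpu::ParallelLoopEmitter emitter(element_generator, ir_array,
//                                         launch_dimensions, b);
//   TF_RETURN_IF_ERROR(emitter.EmitLoop(llvm_ir::IrName(hlo)));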