1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_ 17 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_ 18 19 #include <memory> 20 #include <string> 21 #include <vector> 22 23 #include "mlir/IR/BuiltinOps.h" // from @llvm-project 24 #include "tensorflow/compiler/xla/service/executable.h" 25 #include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h" 26 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" 27 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" 28 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" 29 #include "tensorflow/compiler/xla/service/hlo_module.h" 30 #include "tensorflow/compiler/xla/service/llvm_compiler.h" 31 #include "tensorflow/compiler/xla/statusor.h" 32 #include "tensorflow/compiler/xla/types.h" 33 #include "tensorflow/core/lib/hash/hash.h" 34 #include "tensorflow/core/platform/macros.h" 35 #include "tensorflow/core/platform/stream_executor_no_cuda.h" 36 #include "tensorflow/core/platform/thread_annotations.h" 37 #include "tensorflow/stream_executor/stream_executor_pimpl.h" 38 39 namespace xla { 40 namespace gpu { 41 42 // The GPU compiler generates efficient GPU executables. 
// Abstract base class for XLA's GPU backends. It owns the target-independent
// parts of GPU compilation (HLO pass pipeline, buffer assignment, IR-emitting
// preparation); concrete subclasses supply the target-specific pieces via the
// pure-virtual hooks below (OptimizeHloConvolutionCanonicalization,
// GetGpuVersion, CompileTargetBinary).
class GpuCompiler : public LLVMCompiler {
 public:
  // `target_triple` and `data_layout` are stored as raw pointers; callers must
  // pass string literals (or otherwise guarantee they outlive the compiler).
  GpuCompiler(se::Platform::Id platform_id, const char* target_triple,
              const char* data_layout);
  ~GpuCompiler() override {}

  // Bring in
  // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
  //     std::vector<std::unique_ptr<HloModule>> modules,
  //     std::vector<std::vector<se::StreamExecutor*>>
  //        stream_execs)
  using LLVMCompiler::Compile;

  // Compiler-interface override: the HLO-optimization half of compilation
  // (code generation happens in RunBackend below).
  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
      const CompileOptions& options) override;

  // NOTE: "Assignement" misspelling matches the base-class virtual being
  // overridden — do not "fix" it here.
  StatusOr<
      std::tuple<std::unique_ptr<HloModule>, std::unique_ptr<BufferAssignment>>>
  RunHloPassesAndBufferAssignement(std::unique_ptr<HloModule> hlo_module,
                                   se::StreamExecutor* executor, bool optimize,
                                   const CompileOptions& options) override;

  // Runs the GPU HLO optimization pipeline on `hlo_module` in place.
  // Presumably invoked from RunHloPasses — confirm in the .cc file.
  Status OptimizeHloModule(HloModule* hlo_module,
                           se::StreamExecutor* stream_exec,
                           se::DeviceMemoryAllocator* device_allocator);

  // Target-specific canonicalization of convolutions (e.g. rewriting to the
  // backend's preferred layouts/custom calls). Must be provided by subclasses.
  virtual Status OptimizeHloConvolutionCanonicalization(
      HloModule* hlo_module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator) = 0;

  // Passes that must run after layout assignment; subclasses may extend the
  // default behavior.
  virtual Status OptimizeHloPostLayoutAssignment(
      HloModule* hlo_module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator) = 0;

  // Returns the buffer-sharing predicate used by dataflow analysis. The
  // default callback always returns nullopt, i.e. it expresses no opinion and
  // leaves the decision to the default analysis.
  virtual HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() {
    return
        [](const HloInstruction*, const HloInstruction*,
           const ShapeIndex&) -> absl::optional<bool> { return absl::nullopt; };
  }

  // Queries the concrete device's version (e.g. compute capability) from the
  // executor. Target-specific.
  virtual GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) = 0;

  // Lowers `llvm_module` to target machine code. Returns (text, binary) —
  // presumably assembly plus the device binary; confirm against subclasses.
  // TODO(timshen): Replace `debug_module` with some portable debug information
  // that accommodates both HLO and MLIR.
  virtual StatusOr<std::pair<std::string, std::vector<uint8>>>
  CompileTargetBinary(const HloModuleConfig& module_config,
                      llvm::Module* llvm_module, GpuVersion gpu_version,
                      se::StreamExecutor* stream_exec, bool relocatable,
                      const HloModule* debug_module) = 0;

  // Final HLO-level fixups required before IR emission.
  Status PrepareHloModuleForIrEmitting(HloModule* hlo_module);

  // Compiler-interface override: turns an optimized HloModule into a runnable
  // Executable.
  StatusOr<std::unique_ptr<Executable>> RunBackend(
      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
      const CompileOptions& options) override;

  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
  CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                     AotCompilationOptions const& options) override;

  // Drives CompileTargetBinary for `llvm_module` (ownership is transferred —
  // the module is consumed).
  StatusOr<std::pair<std::string, std::vector<uint8>>> CompileToTargetBinary(
      const HloModuleConfig& module_config,
      std::unique_ptr<llvm::Module> llvm_module,
      se::StreamExecutor* stream_exec, const CompileOptions& options,
      const HloModule* debug_module);

  se::Platform::Id PlatformId() const override { return platform_id_; }

  HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
    // Capture just the pointer size, not the entire GpuCompiler object.
    return [pointer_size = pointer_size_](const Shape& shape) {
      return GetSizeOfShape(shape, pointer_size);
    };
  }

  // On-device byte size of `shape`. Static and tuple shapes use the plain
  // ByteSizeOf; dynamic shapes additionally carry one int32 of metadata per
  // dimension (the runtime dimension sizes).
  static int64 GetSizeOfShape(const Shape& shape, int pointer_size) {
    if (shape.is_static() || shape.IsTuple()) {
      return ShapeUtil::ByteSizeOf(shape, pointer_size);
    }
    // Each dynamic dimension size is represented as a S32.
    int64 metadata_size = sizeof(int32) * shape.dimensions_size();
    return ShapeUtil::ByteSizeOf(shape, pointer_size) + metadata_size;
  }

 private:
  // Links separately compiled (relocatable) device binaries into one. The
  // default fails with Unimplemented; presumably overridden only by backends
  // that support module linking.
  virtual StatusOr<std::vector<uint8>> LinkModules(
      se::StreamExecutor* stream_exec,
      std::vector<std::vector<uint8>> modules) {
    return Unimplemented("LinkModules is not implemented.");
  }

  se::Platform::Id platform_id_;

  // The triple that represents our target.
  const char* target_triple_;

  // The data layout of the emitted module.
  const char* data_layout_;

  // The size in bytes of a pointer. Used by ShapeSizeBytesFunction.
  const int64 pointer_size_;

  TF_DISALLOW_COPY_AND_ASSIGN(GpuCompiler);
};

// Returns the device properties of the GPU behind `stream_exec`.
GpuDeviceInfo GetGpuDeviceInfo(se::StreamExecutor* stream_exec);

// Compile `hlo_module` using XLA GPU and return the LLVM module thus generated.
// The GpuExecutable (and the Thunks that are part of it) are not returned.
StatusOr<std::unique_ptr<llvm::Module>> CompileModuleToLlvmIr(
    HloModule* hlo_module, llvm::LLVMContext* llvm_context,
    const std::string& target_triple, const std::string& data_layout,
    const std::string& platform_name, GpuDeviceInfo gpu_device_info,
    absl::optional<CudaComputeCapability> cuda_compute_capability,
    int pointer_size);

// Compiles the given LMHLO module to an executable.
// ir_emitter_context should be partially populated: buffer_assignment
// or buffer_allocations should not be populated, while other fields should be
// populated (or left empty if that field is optional).
//
// NOTE: buffer_assignment will be gone from ir_emitter_context once LMHLO
// transition is done.
// NOTE(review): `llvm_module` is taken by value as a unique_ptr, so ownership
// transfers to the callee; `ir_emitter_context` is borrowed and must outlive
// the call.
StatusOr<std::unique_ptr<Executable>> CompileLmhloToExecutable(
    GpuCompiler* compiler, mlir::ModuleOp module, std::string module_name,
    const HloModuleConfig& module_config,
    const Compiler::CompileOptions& options,
    absl::string_view entry_function_name, se::StreamExecutor* stream_exec,
    std::unique_ptr<llvm::Module> llvm_module,
    IrEmitterContext* ir_emitter_context);

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_