/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_

#include <memory>
#include <string>
#include <vector>

#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/llvm_compiler.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"

namespace xla {
namespace gpu {

// The GPU compiler generates efficient GPU executables.
class GpuCompiler : public LLVMCompiler {
 public:
  GpuCompiler(se::Platform::Id platform_id, const char* target_triple,
              const char* data_layout);
  ~GpuCompiler() override {}

  using LLVMCompiler::Compile;

  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
      const CompileOptions& options) override;
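
  // A typical driver invokes the two phases declared in this class back to
  // back: RunHloPasses optimizes the module, and RunBackend (declared below)
  // lowers the result to a GpuExecutable. A minimal sketch, assuming
  // `compiler`, `module`, `stream_exec`, and `options` are supplied by the
  // caller:
  //
  //   TF_ASSIGN_OR_RETURN(
  //       module, compiler->RunHloPasses(std::move(module), stream_exec,
  //                                      options));
  //   TF_ASSIGN_OR_RETURN(
  //       std::unique_ptr<Executable> executable,
  //       compiler->RunBackend(std::move(module), stream_exec, options));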

  StatusOr<
      std::tuple<std::unique_ptr<HloModule>, std::unique_ptr<BufferAssignment>>>
  RunHloPassesAndBufferAssignement(std::unique_ptr<HloModule> hlo_module,
                                   se::StreamExecutor* executor, bool optimize,
                                   const CompileOptions& options) override;

  Status OptimizeHloModule(HloModule* hlo_module,
                           se::StreamExecutor* stream_exec,
                           se::DeviceMemoryAllocator* device_allocator);

  virtual Status OptimizeHloConvolutionCanonicalization(
      HloModule* hlo_module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator) = 0;

  virtual Status OptimizeHloPostLayoutAssignment(
      HloModule* hlo_module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator);

  virtual HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() {
    return
        [](const HloInstruction*, const HloInstruction*,
           const ShapeIndex&) -> absl::optional<bool> { return absl::nullopt; };
  }

  virtual GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) = 0;

  // TODO(timshen): Replace `debug_module` with some portable debug information
  // that accommodates both HLO and MLIR.
  virtual StatusOr<std::pair<std::string, std::vector<uint8>>>
  CompileTargetBinary(const HloModuleConfig& module_config,
                      llvm::Module* llvm_module, GpuVersion gpu_version,
                      se::StreamExecutor* stream_exec, bool relocatable,
                      const HloModule* debug_module) = 0;

  Status PrepareHloModuleForIrEmitting(HloModule* hlo_module);

  StatusOr<std::unique_ptr<Executable>> RunBackend(
      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
      const CompileOptions& options) override;

  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
  CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                     AotCompilationOptions const& options) override;

  StatusOr<std::pair<std::string, std::vector<uint8>>> CompileToTargetBinary(
      const HloModuleConfig& module_config,
      std::unique_ptr<llvm::Module> llvm_module,
      se::StreamExecutor* stream_exec, const CompileOptions& options,
      const HloModule* debug_module);

  se::Platform::Id PlatformId() const override { return platform_id_; }

  HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
    // Capture just the pointer size, not the entire GpuCompiler object.
    return [pointer_size = pointer_size_](const Shape& shape) {
      return GetSizeOfShape(shape, pointer_size);
    };
  }

  static int64 GetSizeOfShape(const Shape& shape, int pointer_size) {
    if (shape.is_static() || shape.IsTuple()) {
      return ShapeUtil::ByteSizeOf(shape, pointer_size);
    }
    // Each dynamic dimension size is represented as an S32.
    int64_t metadata_size = sizeof(int32) * shape.dimensions_size();
    return ShapeUtil::ByteSizeOf(shape, pointer_size) + metadata_size;
  }
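
  // Worked example (the numbers follow directly from the code above): a
  // static f32[2,3] occupies 2 * 3 * 4 = 24 bytes. If the same array shape
  // has dynamic dimensions, 2 * sizeof(int32) = 8 bytes of size metadata are
  // added, for 32 bytes total.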

 private:
  virtual StatusOr<std::vector<uint8>> LinkModules(
      se::StreamExecutor* stream_exec,
      std::vector<std::vector<uint8>> modules) {
    return Unimplemented("LinkModules is not implemented.");
  }

  se::Platform::Id platform_id_;

  // The triple that represents our target.
  const char* target_triple_;

  // The data layout of the emitted module.
  const char* data_layout_;

  // The size in bytes of a pointer. Used by ShapeSizeBytesFunction.
  const int64 pointer_size_;

  TF_DISALLOW_COPY_AND_ASSIGN(GpuCompiler);
};

GpuDeviceInfo GetGpuDeviceInfo(se::StreamExecutor* stream_exec);

// Compiles `hlo_module` with the XLA GPU backend and returns the LLVM module
// thus generated. The GpuExecutable (and the Thunks that are part of it) is
// not returned.
StatusOr<std::unique_ptr<llvm::Module>> CompileModuleToLlvmIr(
    HloModule* hlo_module, llvm::LLVMContext* llvm_context,
    const std::string& target_triple, const std::string& data_layout,
    const std::string& platform_name, GpuDeviceInfo gpu_device_info,
    se::CudaComputeCapability cuda_compute_capability, int pointer_size);

// Compiles the given LMHLO module to an executable.
// `ir_emitter_context` should be partially populated: neither
// buffer_assignment nor buffer_allocations should be set, while the other
// fields should be populated (or left empty if optional).
//
// NOTE: buffer_assignment will be removed from ir_emitter_context once the
// LMHLO transition is done.
StatusOr<std::unique_ptr<Executable>> CompileLmhloToExecutable(
    GpuCompiler* compiler, mlir::ModuleOp module, std::string module_name,
    const HloModuleConfig& module_config,
    const Compiler::CompileOptions& options,
    absl::string_view entry_function_name, se::StreamExecutor* stream_exec,
    std::unique_ptr<llvm::Module> llvm_module,
    IrEmitterContext* ir_emitter_context);

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_