/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_

#include <memory>
#include <string>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace gpu {

// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given GPU kernel via the StreamExecutor.
//
// This is an immutable data type after initialization, and thus thread safe.
class GpuExecutable : public Executable {
 public:
  // We need to share ownership of hlo_module and assignment with the profiler
  // so that we can safely keep a reference to these objects during the tracing
  // period; they are therefore passed as shared pointers.
  GpuExecutable(const string& text, const std::vector<uint8>& binary,
                GpuVersion gpu_version,
                std::unique_ptr<const ThunkSchedule> thunk_schedule,
                std::shared_ptr<HloModule> hlo_module,
                std::shared_ptr<const BufferAssignment> assignment,
                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
  ~GpuExecutable() override;

  int64 SizeOfGeneratedCodeInBytes() override;

  // This should be called after set_ir_module_string.
  const string& ir_module_string() const { return ir_module_string_; }

  // This should be called before ExecuteOnStream.
  void set_ir_module_string(const string& ir_module_string) {
    ir_module_string_ = ir_module_string;
  }

  // Returns the compiled code for the computation: PTX on CUDA, and an unused
  // empty string on ROCm.
  const string& text() const { return text_; }

  // Returns the binary stored in this GpuExecutable: a cubin on CUDA, and an
  // HSA code object on ROCm. It may be empty, in which case compilation is
  // left up to the GPU driver.
  const std::vector<uint8>& binary() const { return binary_; }

  // ExecuteAsyncOnStream will fail if the compute capability of the stream
  // doesn't match the compute capability passed to this object's constructor.
  StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

  std::shared_ptr<const BufferAssignment> GetBufferAssignment() const {
    return assignment_;
  }

 private:
  // If `block_host_until_done` is false, execution will not block the host
  // until the kernels have completed. This is used as an optimization for
  // clients, such as TensorFlow, that use a single stream of execution for
  // computations, and allow host-side deallocation from the allocator before
  // GPU execution completes.
  Status ExecuteThunks(const ServiceExecutableRunOptions* run_options,
                       const BufferAllocations& buffer_allocations,
                       bool block_host_until_done,
                       HloExecutionProfile* hlo_execution_profile);

  // Returns the value set of the root instruction of the entry computation.
  // Uses dataflow analysis from buffer assignment.
  const InstructionValueSet& GetRootValueSet() const;

  using BufferAllocToDeviceMemoryMap =
      absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;

  // Loads the PTX or CUBIN for this executable into `stream`'s executor and
  // resolves the globals corresponding to constant buffers. Returns a map from
  // buffer allocation indices to GPU pointers.
  StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
      stream_executor::Stream* stream);

  // Computes an annotation for each thunk and stores them in
  // thunk_annotations_.
  void ComputeThunkAnnotations();

  // Checks that the GPU targeted by `run_options` is compatible with this
  // executable, using either AMD's ISA version or Nvidia's major.minor compute
  // capability, depending on the hardware.
  Status CheckCompatibilityWithServiceExecutableRunOptions(
      const ServiceExecutableRunOptions* run_options);

  // The LLVM IR, in string format, of the unoptimized module generated for
  // this GpuExecutable. We save a string instead of an llvm::Module* because
  // leaving llvm::Module* in a singleton can cause the heap checker to emit
  // false positives.
  //
  // This string should be modified only before ExecuteOnStream.
  string ir_module_string_;

  // The compiled code for the computation.
  const string text_;

  // The GPU machine code for the computation, targeting GPUs at
  // compute_capability_.
  //
  // May be empty, in which case we leave compilation up to the GPU driver.
  const std::vector<uint8> binary_;

  // The GPU version used for the compute-capability compatibility check.
  GpuVersion gpu_version_;

  // The thunks to be invoked by this GpuExecutable. They are generated by the
  // IrEmitter.
  const std::unique_ptr<const ThunkSchedule> thunk_schedule_;

  // Owns the buffer data at runtime. It provides the information needed to
  // allocate memory for every output/temp buffer.
  const std::shared_ptr<const BufferAssignment> assignment_;

  // Maps a thunk to a string describing the thunk. This is useful when
  // constructing ScopeAnnotation objects.
  absl::flat_hash_map<Thunk*, string> thunk_annotations_;

  // Cache of module handles and constant buffer allocation maps used by
  // `ResolveConstantGlobals`.
  tensorflow::mutex module_handle_mutex_;
  std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
      module_handles_ GUARDED_BY(module_handle_mutex_);
  std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
      module_globals_ GUARDED_BY(module_handle_mutex_);

  TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
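
// Example usage (a minimal, hypothetical sketch, not taken from the XLA
// sources): a client that already holds a `ServiceExecutableRunOptions*`
// named `run_options` and the per-parameter device buffers in a
// `std::vector<ShapeTree<MaybeOwningDeviceMemory>>` named `args` would drive
// the executable through the ExecuteAsyncOnStream entry point declared above.
// Both `run_options` and `args` are assumed to be constructed by the
// surrounding service code.
//
//   xla::StatusOr<xla::ExecutionOutput> result =
//       gpu_executable->ExecuteAsyncOnStream(
//           run_options, std::move(args), /*hlo_execution_profile=*/nullptr);
//   if (!result.ok()) {
//     // Fails here if, e.g., the stream's compute capability does not match
//     // the one this executable was compiled for.
//     return result.status();
//   }
//   xla::ExecutionOutput output = std::move(result).ValueOrDie();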