/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_

#include <memory>
#include <string>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace gpu {

// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given GPU kernel via the StreamExecutor.
//
// This is an immutable data type after initialization, and thus thread safe.
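//
// Example usage (an illustrative sketch only; constructing `params`,
// `run_options`, and `arguments` happens elsewhere in the GPU compiler and the
// XLA service, and is assumed here rather than shown):
//
//   GpuExecutable::Params params = ...;  // produced by the compiler backend
//   auto executable = absl::make_unique<GpuExecutable>(std::move(params));
//   TF_ASSIGN_OR_RETURN(
//       ExecutionOutput result,
//       executable->ExecuteAsyncOnStream(&run_options, std::move(arguments),
//                                        /*hlo_execution_profile=*/nullptr));
//   // Work is enqueued on the stream held by `run_options`; block on that
//   // stream if host-side synchronization is needed.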
class GpuExecutable : public Executable {
 public:
  struct ConstantInfo {
    std::string symbol_name;
    std::vector<uint8> content;
    int allocation_index = -1;
  };

  struct OutputInfo {
    // Corresponding allocation index.
    int allocation_index;

    // Output is passed-through from a parameter.
    bool passthrough = false;

    // Whether this output is hinted to alias a parameter (BufferAllocation*
    // would indicate the aliased parameter), and what kind of alias it is.
    absl::optional<HloInputOutputAliasConfig::Alias> alias_config;
  };
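  // Illustrative example of OutputInfo (hypothetical indices, not taken from
  // this header): for a computation whose result is a two-element tuple, the
  // output_info map produced by GetOutputInfo() below might contain entries
  // along the lines of
  //   {ShapeIndex({0}), OutputInfo{/*allocation_index=*/1}}
  //   {ShapeIndex({1}), OutputInfo{/*allocation_index=*/3, /*passthrough=*/true}}
  // where the second element is forwarded directly from an input parameter.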
  struct Params {
    std::string asm_text;
    std::vector<uint8> binary;
    GpuVersion gpu_version;
    std::unique_ptr<const ThunkSchedule> thunk_schedule;
    std::vector<ConstantInfo> constants;
    absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
    std::string module_name;
    xla::Shape output_shape;
    std::vector<BufferAllocation> allocations;
    std::unique_ptr<BufferAssignmentProto> debug_buffer_assignment;
    std::unique_ptr<HloModule> debug_module = nullptr;
    size_t entry_computation_profile_index = 0;
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data = nullptr;
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map = nullptr;
  };

  // We need to share ownership of hlo_module and assignment with the profiler
  // to safely keep a reference to these objects during the tracing period, so
  // they are passed as shared pointers.
  explicit GpuExecutable(Params params);
  ~GpuExecutable() override;

  int64 SizeOfGeneratedCodeInBytes() const override;

  // This should be called after set_ir_module_string.
  const string& ir_module_string() const { return ir_module_string_; }

  // This should be called before ExecuteOnStream.
  void set_ir_module_string(const string& ir_module_string) {
    ir_module_string_ = ir_module_string;
  }

  // Returns the compiled code for the computation. The compiled code is PTX on
  // CUDA and an unused empty string on ROCm.
  const string& text() const { return text_; }

  // Returns the binary stored in this GpuExecutable. The binary is a cubin on
  // CUDA and an HSA code object on ROCm. It may be empty, in which case
  // compilation is left up to the GPU driver.
  const std::vector<uint8>& binary() const { return binary_; }

  // ExecuteAsyncOnStream will fail if the compute capability of the stream
  // doesn't match the compute capability passed to this object's constructor.
  StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile);

  using VariantArguments = absl::variant<absl::Span<const ShapedBuffer* const>,
                                         absl::Span<ExecutionInput>>;
  StatusOr<ExecutionOutput> ExecuteAsyncOnStreamImpl(
      const ServiceExecutableRunOptions* run_options,
      VariantArguments arguments, HloExecutionProfile* hlo_execution_profile);

  absl::Span<const BufferAllocation> GetAllocations() const {
    return allocations_;
  }

 private:
  // If `block_host_until_done` is false, execution will not block the host
  // until the kernels have completed. This is used as an optimization for
  // clients, such as TensorFlow, that use a single stream of execution for
  // computations, and allow host-side deallocation from the allocator before
  // GPU execution completes.
  Status ExecuteThunks(const ServiceExecutableRunOptions* run_options,
                       const BufferAllocations& buffer_allocations,
                       bool block_host_until_done,
                       HloExecutionProfile* hlo_execution_profile);

  using BufferAllocToDeviceMemoryMap =
      absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;

  // Loads the PTX or CUBIN for this executable into the stream's executor and
  // resolves the globals corresponding to constant buffers. Returns a map from
  // buffer allocation indices to GPU pointers.
  StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
      stream_executor::Stream* stream);
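  // Illustrative note (an informal description, not a contract): after
  // resolution, the returned map holds, for each constant with a non-negative
  // ConstantInfo::allocation_index, the device address of its global, so
  // BufferForAllocation() can reuse that address instead of allocating fresh
  // device memory for the constant.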
  // Checks compatibility with the run options, using either AMD's ISA version
  // or Nvidia's major.minor compute capability, depending on the hardware.
  Status CheckCompatibilityWithServiceExecutableRunOptions(
      const ServiceExecutableRunOptions* run_options);

  StatusOr<BufferAllocations> GenerateBufferAllocations(
      VariantArguments arguments,
      const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
      se::DeviceMemoryAllocator* const memory_allocator,
      se::StreamExecutor* executor);

  StatusOr<se::DeviceMemoryBase> BufferForAllocation(
      VariantArguments arguments,
      const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
      const BufferAllocation& allocation,
      se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
      int64 arg_idx);

  // The LLVM IR, in string format, of the unoptimized module generated for
  // this GpuExecutable. We save a string instead of an llvm::Module* because
  // leaving llvm::Module* in a singleton can cause the heap checker to emit
  // false positives.
  //
  // This string should be modified only before ExecuteOnStream.
  string ir_module_string_;

  // The compiled code for the computation.
  const string text_;

  // The GPU machine code for the computation, targeting GPUs at
  // compute_capability_.
  //
  // May be empty, in which case we leave compilation up to the GPU driver.
  const std::vector<uint8> binary_;

  // The GPU version, used for the compute capability compatibility check.
  GpuVersion gpu_version_;

  // The thunks to be invoked by this GpuExecutable. They are generated by the
  // IrEmitter.
  const std::unique_ptr<const ThunkSchedule> thunk_schedule_;

  std::string module_name_;

  xla::Shape output_shape_;

  // Owns the buffer data at runtime. It provides the information needed to
  // allocate memory for every output/temp buffer.
  const std::vector<BufferAllocation> allocations_;

  std::shared_ptr<BufferAssignmentProto> debug_buffer_assignment_;

  size_t entry_computation_profile_index_ = -1;

  // Cache of module handles and constant buffer allocation maps used by
  // `ResolveConstantGlobals`.
  tensorflow::mutex module_handle_mutex_;
  std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
      module_handles_ TF_GUARDED_BY(module_handle_mutex_);
  std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
      module_globals_ TF_GUARDED_BY(module_handle_mutex_);

  std::vector<ConstantInfo> constants_;
  const absl::flat_hash_map<ShapeIndex, OutputInfo> output_info_;

  TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};

StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>
GetOutputInfo(const HloModule& hlo_module, const BufferAssignment& assignment);

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_