/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_

#include <memory>
#include <string>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {
namespace gpu {

// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given CUDA kernel via the StreamExecutor.
//
// This is an immutable data type after initialization, and thus thread safe.
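//
// Example usage (a rough sketch; in practice the XLA service layer, via
// GpuCompiler, constructs the executable and supplies the run options and
// argument buffers, all of which are elided here):
//
//   std::unique_ptr<GpuExecutable> executable = ...;
//   const ServiceExecutableRunOptions* run_options = ...;
//   std::vector<const ShapedBuffer*> arguments = ...;
//   TF_ASSIGN_OR_RETURN(
//       ScopedShapedBuffer result,
//       executable->ExecuteOnStream(run_options, arguments,
//                                   /*hlo_execution_profile=*/nullptr));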
class GpuExecutable : public Executable {
 public:
  // cubin (i.e. the compiled ptx) may be empty, in which case we leave
  // compilation up to the GPU driver.
  GpuExecutable(const string& ptx, const std::vector<uint8>& cubin,
                std::pair<int, int> compute_capability,
                std::unique_ptr<const ThunkSchedule> thunk_schedule,
                std::unique_ptr<HloModule> hlo_module,
                std::unique_ptr<const BufferAssignment> assignment,
                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);

  // This should be called after set_ir_module_string.
  const string& ir_module_string() const { return ir_module_string_; }

  // This should be called before ExecuteOnStream.
  void set_ir_module_string(const string& ir_module_string) {
    ir_module_string_ = ir_module_string;
  }

  // Returns the compiled PTX for the computation.
  const string& ptx() const { return ptx_; }

  // Returns the cubin (compiled PTX) stored in this GpuExecutable. May be
  // empty, in which case compilation is left up to the GPU driver.
  const std::vector<uint8>& cubin() const { return cubin_; }

  // ExecuteOnStream will fail if the compute capability of the stream doesn't
  // match the compute capability passed to this object's constructor.
  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments) override;

 private:
  // If `block_host_until_done` is false, execution will not block the host
  // until the kernels have completed. This is used as an optimization for
  // clients, such as TensorFlow, that use a single stream of execution for
  // computations, and allow host-side deallocation from the allocator before
  // GPU execution completes.
  Status ExecuteThunks(const ServiceExecutableRunOptions* run_options,
                       const BufferAllocations& buffer_allocations,
                       bool block_host_until_done,
                       HloExecutionProfile* hlo_execution_profile);

  // Returns the points-to set of the root instruction of the entry
  // computation. Uses points-to analysis from buffer assignment.
  const PointsToSet& GetRootPointsToSet() const;

  using BufferAllocToDeviceMemoryMap =
      absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>;

  // Loads the PTX or CUBIN for this executable into `executor` and resolves
  // the globals corresponding to constant buffers. Returns a map from buffer
  // allocation indices to GPU pointers.
  StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
      stream_executor::StreamExecutor* executor);

  // The LLVM IR, in string format, of the unoptimized module generated for
  // this GpuExecutable. We save a string instead of an llvm::Module* because
  // leaving llvm::Module* in a singleton can cause the heap checker to emit
  // false positives.
  //
  // This string should be modified only before ExecuteOnStream.
  string ir_module_string_;

  // The PTX for the computation.
  const string ptx_;

  // The GPU machine code for the computation, targeting GPUs at
  // compute_capability_.
  //
  // May be empty, in which case we leave compilation up to the GPU driver.
  const std::vector<uint8> cubin_;

  // The compute capability of the GPU we're targeting with this GpuExecutable.
  std::pair<int, int> compute_capability_;

  // The thunks to be invoked by this GpuExecutable. They are generated by the
  // IrEmitter.
  const std::unique_ptr<const ThunkSchedule> thunk_schedule_;

  // Owns the buffer data at runtime. It provides the information needed to
  // allocate memory for every output/temp buffer.
  const std::unique_ptr<const BufferAssignment> assignment_;

  // Cache of module handles and constant buffer allocation maps used by
  // `ResolveConstantGlobals`.
  tensorflow::mutex module_handle_mutex_;
  std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
      module_handles_ GUARDED_BY(module_handle_mutex_);
  std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
      module_globals_ GUARDED_BY(module_handle_mutex_);

  TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_H_