/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_

#include <memory>
#include <utility>
#include <vector>

#include "absl/types/span.h"
#include "absl/types/variant.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {

// ExecutionOutput encapsulates the output buffers of an execution and the
// leftover buffers to be released by the caller.
class ExecutionOutput {
 public:
  ExecutionOutput(ScopedShapedBuffer result,
                  std::vector<se::OwningDeviceMemory> to_be_released,
                  std::vector<ShapeIndex> aliased_indices,
                  se::OwningDeviceMemory output_shape_table)
      : result_(std::move(result)),
        to_be_released_(std::move(to_be_released)),
        aliased_indices_(std::move(aliased_indices)),
        output_shape_table_(std::move(output_shape_table)) {}
  ExecutionOutput(ExecutionOutput&&) = default;
  ExecutionOutput& operator=(ExecutionOutput&&) = default;

  ~ExecutionOutput() {
    // If the ExecutionOutput has not been committed and there are aliased
    // indices, clear them off the ScopedShapedBuffer to prevent them from
    // being released.
    for (auto& index : aliased_indices_) {
      result_.set_buffer(se::OwningDeviceMemory(), index);
    }
  }

  // Should be called once it is known that the execute operation succeeded,
  // before returning the ExecutionOutput to the caller.
  ExecutionOutput& Commit() {
    aliased_indices_.clear();
    return *this;
  }

  const ScopedShapedBuffer& Result() const { return result_; }

  const se::OwningDeviceMemory& ShapeTable() const {
    return output_shape_table_;
  }

  ScopedShapedBuffer ConsumeResult() {
    aliased_indices_.clear();
    return std::move(result_);
  }

  se::OwningDeviceMemory ConsumeShapeTable() {
    return std::move(output_shape_table_);
  }

  const std::vector<se::OwningDeviceMemory>& ToBeReleased() const {
    return to_be_released_;
  }

  std::vector<se::OwningDeviceMemory> ConsumeToBeReleased() {
    return std::move(to_be_released_);
  }

 private:
  ScopedShapedBuffer result_;

  // Leftover buffers for the caller to release. Elements in this list are
  // donated input memory buffers that are not reused by XLA as outputs.
  std::vector<se::OwningDeviceMemory> to_be_released_;

  // These are the indices in result_ which have been aliased from the caller.
  // If the execution operation fails, the caller should maintain ownership of
  // the buffer, so we track the indices here and, unless the ExecutionOutput
  // is committed, remove them from result_ before destruction.
  std::vector<ShapeIndex> aliased_indices_;

  // A shape table is a contiguous region of memory that is used to hold the
  // runtime dimension sizes of dynamic output shapes.
  se::OwningDeviceMemory output_shape_table_;
};
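
// A minimal usage sketch for the commit protocol above (hypothetical: the
// `executable`, `run_options`, and `args` names are illustrative, not part of
// this header). Commit() is called only once the launch is known to have
// succeeded; on the failure path the destructor strips the aliased indices,
// so the caller retains ownership of its donated buffers:
//
//   StatusOr<ExecutionOutput> output_or = executable->ExecuteAsyncOnStream(
//       &run_options, std::move(args), /*hlo_execution_profile=*/nullptr);
//   if (!output_or.ok()) {
//     // Not committed: aliased buffers stay owned by the caller.
//     return output_or.status();
//   }
//   ExecutionOutput output = output_or.ConsumeValueOrDie();
//   ScopedShapedBuffer result = output.Commit().ConsumeResult();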

// A given platform's compiler will produce an Executable -- this is a uniform
// interface that is used for launching compiled programs across platforms.
class Executable {
 public:
  explicit Executable(
      std::shared_ptr<HloModule> hlo_module,
      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
      : hlo_module_(std::move(hlo_module)),
        hlo_profile_printer_data_(std::move(hlo_profile_printer_data)),
        hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
    CHECK_EQ(hlo_profile_printer_data_.get() == nullptr,
             hlo_profile_index_map_.get() == nullptr);
  }
  virtual ~Executable() {}

  // Enqueues the compilation result on the provided stream, passing the given
  // arguments. This call is blocking and returns after the execution is done.
  //
  // If the hlo_execution_profile is provided as non-nullptr, profiling will be
  // enabled.
  //
  // Returns a shaped buffer containing the result of the computation.
  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile);

  // Starts the given program executing on the given stream/executor.
  //
  // `arguments` are ShapeTrees containing the input parameters. For each
  // element in a shape tree, if the element holds ownership of the memory, it
  // is considered donated and XLA will potentially reuse it as an output
  // buffer. XLA is also responsible for freeing all donated inputs.
  //
  // If an input is donated to XLA but is not reused as an output, it is
  // returned as a leftover buffer for the caller to release.
  //
  // This call should be non-blocking and may return as soon as all of the
  // operations are enqueued for launch on the stream. Note that some
  // implementations may in fact block, or may block in some circumstances
  // (e.g., when profiling); i.e., asynchronous is a "may" not a "must".
  //
  // If the hlo_execution_profile is provided as non-nullptr, profiling will be
  // enabled. Note that profiling is tricky to use correctly, as the profiling
  // objects (when they exist) must outlive the task.
  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile);

  // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to
  // complete.
  StatusOr<ExecutionOutput> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
      HloExecutionProfile* hlo_execution_profile);

  virtual StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
      HloExecutionProfile* hlo_execution_profile) = 0;
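
  // A sketch of how a caller might donate one input buffer to the overloads
  // above (hypothetical: `param_shape` and `owned_buffer` are illustrative).
  // Wrapping an allocation in se::OwningDeviceMemory marks the element as
  // donated, while a plain se::DeviceMemoryBase element leaves ownership with
  // the caller:
  //
  //   std::vector<ShapeTree<MaybeOwningDeviceMemory>> args;
  //   ShapeTree<MaybeOwningDeviceMemory> arg(param_shape);
  //   // Donated: XLA may reuse this allocation as an output buffer and is
  //   // responsible for freeing it.
  //   *arg.mutable_element({}) =
  //       MaybeOwningDeviceMemory(std::move(owned_buffer));
  //   args.push_back(std::move(arg));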

  // Same as ExecuteOnStream(), but runs this executable on multiple
  // streams. arguments[i] contains the arguments to the execution on
  // run_options[i]->stream() and the returned value is at index i of the
  // returned vector.
  virtual StatusOr<std::vector<ScopedShapedBuffer>> ExecuteOnStreams(
      absl::Span<const ServiceExecutableRunOptions> run_options,
      absl::Span<const absl::Span<const ShapedBuffer* const>> arguments);

  // Populates `hlo_execution_profile` from `stream`. This is implicit in any
  // Execute* API call that takes a hlo_execution_profile argument, but must be
  // called explicitly for other (async, for example) variants after the stream
  // has completed.
  virtual Status PopulateExecutionProfile(
      ExecutionProfile* execution_profile,
      HloExecutionProfile* hlo_execution_profile, se::Stream* stream) {
    return Status::OK();
  }

  // Convenience wrappers for calling Executable::ExecuteOnStream and
  // Executable::ExecuteAsyncOnStream. They set up a timer for the execution,
  // set up HLO profiling if enabled, and fill in the given ExecutionProfile
  // if non-null.
  StatusOr<ScopedShapedBuffer> ExecuteOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments);

  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments);

  StatusOr<ExecutionOutput> ExecuteAsyncOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments);
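
  // A sketch of an execution with HLO profiling enabled (hypothetical:
  // `executable`, `run_options`, and `arguments` are illustrative; this only
  // applies when the executable was compiled with profiling support, see the
  // accessors below):
  //
  //   if (executable->hlo_profiling_enabled()) {
  //     HloExecutionProfile profile(&executable->hlo_profile_printer_data(),
  //                                 &executable->hlo_profile_index_map());
  //     TF_ASSIGN_OR_RETURN(
  //         ScopedShapedBuffer result,
  //         executable->ExecuteOnStream(&run_options, arguments, &profile));
  //     // `profile` must outlive the execution; that holds here because
  //     // ExecuteOnStream blocks until the computation completes.
  //   }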

  const HloProfilePrinterData& hlo_profile_printer_data() const {
    CHECK(hlo_profiling_enabled());
    return *hlo_profile_printer_data_;
  }

  const HloProfileIndexMap& hlo_profile_index_map() const {
    CHECK(hlo_profiling_enabled());
    return *hlo_profile_index_map_;
  }

  // Returns whether this executable was compiled with HLO profiling support
  // enabled. If not, the caller should not expect an hlo_execution_profile
  // passed to ExecuteOnStream above to be populated during execution.
  bool hlo_profiling_enabled() const {
    return hlo_profile_printer_data_ != nullptr;
  }

  HloModule& module() const { return *hlo_module_; }
  std::shared_ptr<HloModule> shared_module() const { return hlo_module_; }

  const bool has_module() const { return hlo_module_ != nullptr; }

  const HloModuleConfig& module_config() const { return hlo_module_->config(); }

  // The shape (including layout) that results from this execution. This is the
  // shape of the DeviceMemoryBase result value in ExecuteOnStream above.
  const Shape& result_shape() const {
    return hlo_module_->config().entry_computation_layout().result_shape();
  }

  // Returns the size of the executable in bytes. Returns -1 if this query is
  // not supported by the executable.
  //
  // Does not include the size of used libraries (e.g. cuDNN, Eigen, etc.).
  virtual int64 SizeOfGeneratedCodeInBytes();

  // Dumping helpers.
  void set_hlo_proto(std::unique_ptr<xla::HloProto> hlo_proto) {
    hlo_proto_ = std::move(hlo_proto);
  }
  bool dumping_snapshot() const { return hlo_proto_ != nullptr; }
  HloProto const* hlo_proto() const { return hlo_proto_.get(); }

 protected:
  // HloModule this was compiled from. BufferAssignment keeps pointers to
  // HloInstructions owned by the HloModule so we need to keep the HloModule
  // around.
  const std::shared_ptr<HloModule> hlo_module_;

  // The serialized HLO proto. Non-null only if dumping snapshots is enabled.
  std::unique_ptr<HloProto const> hlo_proto_;

  // Execution count, used to generate a unique filename for each dumped
  // execution.
  int64 execution_count_ = 0;

  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_;
  std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
};

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_