1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_ 17 #define TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_ 18 19 #include <memory> 20 #include <set> 21 #include <utility> 22 #include <vector> 23 24 #include "absl/types/span.h" 25 #include "absl/types/variant.h" 26 #include "tensorflow/compiler/xla/debug_options_flags.h" 27 #include "tensorflow/compiler/xla/service/computation_layout.h" 28 #include "tensorflow/compiler/xla/service/hlo.pb.h" 29 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" 30 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" 31 #include "tensorflow/compiler/xla/service/hlo_module.h" 32 #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" 33 #include "tensorflow/compiler/xla/service/service_executable_run_options.h" 34 #include "tensorflow/compiler/xla/service/shaped_buffer.h" 35 #include "tensorflow/compiler/xla/shape_tree.h" 36 #include "tensorflow/compiler/xla/statusor.h" 37 #include "tensorflow/compiler/xla/util.h" 38 #include "tensorflow/compiler/xla/xla_data.pb.h" 39 #include "tensorflow/core/platform/stream_executor_no_cuda.h" 40 #include "tensorflow/stream_executor/device_memory_allocator.h" 41 42 namespace xla { 43 44 // TODO(b/150633678): Both the ExecutionInput and ExecutionOutput need to be 45 // revisited, with the execute APIs taking data structure which can better model 46 // shareable buffers. 47 // 48 // ExecutionInput buffers are in one of three states: 49 // 50 // 1) Owned by the caller and immutable. 51 // 2) Donated by the caller but returned on error. 52 // 3) Donated by the caller and freed on error. 53 // 54 // Case (1) buffers are stored as MaybeOwningDeviceMemory(DeviceMemoryBase). 55 // Case (2) buffers are stored as MaybeOwningDeviceMemory(OwningDeviceMemory), 56 // with their indices present in unowned_indices_. 57 // Case (3) buffers are stored as MaybeOwningDeviceMemory(OwningDeviceMemory), 58 // with their indices absent from unowned_indices_. 59 class ExecutionInput { 60 public: ExecutionInput(xla::Shape shape)61 explicit ExecutionInput(xla::Shape shape) : buffers_(std::move(shape)) { 62 SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); 63 } 64 // TODO(b/170310047): remove this overload. ExecutionInput(xla::Shape shape,xla::Shape host_shape)65 ExecutionInput(xla::Shape shape, xla::Shape host_shape) 66 : buffers_(std::move(shape)) { 67 SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); 68 } 69 ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers)70 explicit ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers) 71 : buffers_(std::move(buffers)) { 72 SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); 73 } 74 // TODO(b/170310047): remove this overload. ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers,xla::Shape host_shape)75 ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers, 76 xla::Shape host_shape) 77 : buffers_(std::move(buffers)) { 78 SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); 79 } 80 81 ExecutionInput(ExecutionInput&&) = default; 82 83 ~ExecutionInput(); 84 85 ExecutionInput& operator=(ExecutionInput&&) = default; 86 shape()87 const Shape& shape() const { 88 return dynamic_shape_ != nullptr ? *dynamic_shape_ : buffers_.shape(); 89 } 90 host_shape()91 const Shape& host_shape() const { 92 return host_shape_ != nullptr ? *host_shape_ : shape(); 93 } 94 95 Status SetDynamicShape(Shape dynamic_shape); 96 97 xla::StatusOr<xla::ShapedBuffer> ToShapedBuffer( 98 se::DeviceMemoryAllocator* allocator, int device_ordinal) const; 99 SetBuffer(const ShapeIndex & index,MaybeOwningDeviceMemory buffer)100 void SetBuffer(const ShapeIndex& index, MaybeOwningDeviceMemory buffer) { 101 *buffers_.mutable_element(index) = std::move(buffer); 102 } 103 104 void SetUnownedBuffer(const ShapeIndex& index, 105 MaybeOwningDeviceMemory buffer); 106 SetUnownedIndex(const ShapeIndex & index)107 void SetUnownedIndex(const ShapeIndex& index) { 108 unowned_indices_.insert(index); 109 } 110 ClearUnownedIndex(const ShapeIndex & index)111 void ClearUnownedIndex(const ShapeIndex& index) { 112 unowned_indices_.erase(index); 113 } 114 unowned_indices()115 const std::set<ShapeIndex>& unowned_indices() { return unowned_indices_; } 116 Buffers()117 const ShapeTree<MaybeOwningDeviceMemory>& Buffers() const { return buffers_; } 118 MutableBuffers()119 ShapeTree<MaybeOwningDeviceMemory>* MutableBuffers() { return &buffers_; } 120 MutableBuffer(const ShapeIndex & index)121 MaybeOwningDeviceMemory* MutableBuffer(const ShapeIndex& index) { 122 return buffers_.mutable_element(index); 123 } 124 Buffer(const ShapeIndex & index)125 const MaybeOwningDeviceMemory& Buffer(const ShapeIndex& index) const { 126 return buffers_.element(index); 127 } 128 129 private: SetHostShape(xla::Shape host_shape)130 void SetHostShape(xla::Shape host_shape) { 131 if (shape() != host_shape) { 132 host_shape_ = std::make_unique<Shape>(std::move(host_shape)); 133 } 134 } 135 136 ShapeTree<MaybeOwningDeviceMemory> buffers_; 137 // Set of indices of buffers that should be returned to the caller if an error 138 // occurs when enqueuing the computation. 139 std::set<ShapeIndex> unowned_indices_; 140 std::unique_ptr<Shape> dynamic_shape_; 141 std::unique_ptr<Shape> host_shape_; 142 }; 143 144 // ExecutionOutput encapsulates the output buffers of a execution and the 145 // leftover buffers to be released by the caller. 146 class ExecutionOutput { 147 public: ExecutionOutput(ScopedShapedBuffer result)148 explicit ExecutionOutput(ScopedShapedBuffer result) 149 : result_(std::move(result)) {} ExecutionOutput(ScopedShapedBuffer result,std::vector<se::OwningDeviceMemory> to_be_released)150 ExecutionOutput(ScopedShapedBuffer result, 151 std::vector<se::OwningDeviceMemory> to_be_released) 152 : result_(std::move(result)), 153 to_be_released_(std::move(to_be_released)) {} 154 // TODO(b/170310047): remove this overload. ExecutionOutput(Shape on_host_shape,Shape on_device_shape,se::DeviceMemoryAllocator * allocator,int device_ordinal)155 ExecutionOutput(Shape on_host_shape, Shape on_device_shape, 156 se::DeviceMemoryAllocator* allocator, int device_ordinal) 157 : result_(std::move(on_device_shape), allocator, device_ordinal) {} ExecutionOutput(Shape on_device_shape,se::DeviceMemoryAllocator * allocator,int device_ordinal)158 ExecutionOutput(Shape on_device_shape, se::DeviceMemoryAllocator* allocator, 159 int device_ordinal) 160 : result_(std::move(on_device_shape), allocator, device_ordinal) {} 161 ExecutionOutput(ExecutionOutput&&) = default; 162 ExecutionOutput& operator=(ExecutionOutput&&) = default; 163 ~ExecutionOutput()164 ~ExecutionOutput() { 165 // If the ExecutionOutput has not been committed, and if there are aliased 166 // indices, clear them off the ScopedShapedBuffer to prevent them to be 167 // released. 168 for (auto& index : aliased_indices_) { 169 result_.set_buffer(se::OwningDeviceMemory(), index); 170 } 171 } 172 AddAliasedIndex(ShapeIndex index)173 void AddAliasedIndex(ShapeIndex index) { 174 aliased_indices_.push_back(std::move(index)); 175 } 176 AddToBeReleased(se::OwningDeviceMemory mem)177 void AddToBeReleased(se::OwningDeviceMemory mem) { 178 to_be_released_.push_back(std::move(mem)); 179 } 180 181 // Should be called once it is known that the execute operation succeeded, 182 // before returning the ExecutionOutput to the caller. Commit()183 ExecutionOutput& Commit() { 184 aliased_indices_.clear(); 185 return *this; 186 } 187 Result()188 const ScopedShapedBuffer& Result() const { return result_; } 189 MutableResult()190 ScopedShapedBuffer* MutableResult() { return &result_; } 191 ConsumeResult()192 ScopedShapedBuffer ConsumeResult() { 193 aliased_indices_.clear(); 194 return std::move(result_); 195 } 196 ToBeReleased()197 const std::vector<se::OwningDeviceMemory>& ToBeReleased() const { 198 return to_be_released_; 199 } 200 ConsumeToBeReleased()201 std::vector<se::OwningDeviceMemory> ConsumeToBeReleased() { 202 return std::move(to_be_released_); 203 } 204 ConsumeAliasedIndices()205 std::vector<ShapeIndex> ConsumeAliasedIndices() { 206 auto aliased = std::move(aliased_indices_); 207 aliased_indices_.clear(); 208 return aliased; 209 } 210 211 private: 212 ScopedShapedBuffer result_; 213 214 // Leftover buffers for the caller to release. Elements in this list are 215 // donated input memory buffers that are not reused by XLA as outputs. 216 std::vector<se::OwningDeviceMemory> to_be_released_; 217 218 // These are the indices in result_ which have been aliased from the caller. 219 // If the execution operation fails, the caller should maintain ownership of 220 // the buffer, so we track the indices here, and unless the ExecutionOutput is 221 // committed, we remove them from the result_ before destruction. 222 std::vector<ShapeIndex> aliased_indices_; 223 224 // A shape table is a continuous region in memory that is used to hold the 225 // runtime dimension sizes of dynamic output shapes. 226 se::OwningDeviceMemory output_shape_table_; 227 }; 228 229 // A given platform's compiler will produce an Executable -- this is a uniform 230 // interface that is used for launching compiled programs across platforms. 231 class Executable { 232 public: Executable(std::shared_ptr<HloModule> hlo_module)233 explicit Executable(std::shared_ptr<HloModule> hlo_module) 234 : hlo_module_(std::move(hlo_module)) {} 235 236 // TODO(b/172012028): Remove this constructor. Executable(std::shared_ptr<HloModule> hlo_module,std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)237 explicit Executable( 238 std::shared_ptr<HloModule> hlo_module, 239 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data, 240 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map) 241 : hlo_module_(std::move(hlo_module)), 242 hlo_profile_printer_data_(std::move(hlo_profile_printer_data)), 243 hlo_profile_index_map_(std::move(hlo_profile_index_map)) { 244 CHECK_EQ(hlo_profile_printer_data_.get() == nullptr, 245 hlo_profile_index_map_.get() == nullptr); 246 } ~Executable()247 virtual ~Executable() {} 248 249 // Enqueues the compilation result on the provided stream, passing the given 250 // arguments. This call is blocking and returns after the execution is done. 251 // 252 // If the hlo_execution_profile is provided as non-nullptr, profiling will be 253 // enabled. 254 // 255 // Returns a shaped buffer containing the result of the computation. 256 StatusOr<ScopedShapedBuffer> ExecuteOnStream( 257 const ServiceExecutableRunOptions* run_options, 258 absl::Span<const ShapedBuffer* const> arguments, 259 HloExecutionProfile* hlo_execution_profile); 260 261 // Starts the given program executing on the given stream/executor. 262 // 263 // `arguments` are ShapeTree containing the input parameters. For each element 264 // in the shape tree, if the element holds the ownership of the memory, it is 265 // considered donated and XLA will potentially reuse it as output buffers. For 266 // all donated inputs, XLA is also responsible for freeing them. 267 // 268 // If an input is donated to XLA but is not reused as output, it is returned 269 // as an leftover buffer for the caller to release. 270 // 271 // This call should be non-blocking and may return as soon as all of the 272 // operations are enqueued for launch on the stream. Note that some 273 // implementations may in fact block or may block in some circumstances (e.g., 274 // when profiling); i.e., asynchronous is a "may" not a "must". 275 // 276 // If the hlo_execution_profile is provided as non-nullptr, profiling will be 277 // enabled. Note that profiling is tricky to use correctly, as the profiling 278 // objects (when they exist) must out-live the task. 279 virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream( 280 const ServiceExecutableRunOptions* run_options, 281 absl::Span<const ShapedBuffer* const> arguments, 282 HloExecutionProfile* hlo_execution_profile); 283 284 // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to 285 // complete. 286 StatusOr<ExecutionOutput> ExecuteOnStream( 287 const ServiceExecutableRunOptions* run_options, 288 std::vector<ExecutionInput> arguments, 289 HloExecutionProfile* hlo_execution_profile); 290 291 virtual StatusOr<ExecutionOutput> ExecuteAsyncOnStream( 292 const ServiceExecutableRunOptions* run_options, 293 std::vector<ExecutionInput> arguments, 294 HloExecutionProfile* hlo_execution_profile) = 0; 295 296 // Same as ExecuteOnStream(), but runs this executable on multiple 297 // streams. arguments[i] contains the arguments to the execution on 298 // run_options[i]->stream() and the returned value is at index i of the 299 // returned vector. 300 virtual StatusOr<std::vector<ScopedShapedBuffer>> ExecuteOnStreams( 301 absl::Span<const ServiceExecutableRunOptions> run_options, 302 absl::Span<const absl::Span<const ShapedBuffer* const>> arguments); 303 304 // Populates `hlo_execution_profile` from `executor`. This is implicit in any 305 // Execute* API call that takes a hlo_execution_profile argument, but must be 306 // called explicitly for other (async, for example) variants after the stream 307 // has completed. PopulateExecutionProfile(ExecutionProfile * execution_profile,HloExecutionProfile * hlo_execution_profile,se::Stream * stream)308 virtual Status PopulateExecutionProfile( 309 ExecutionProfile* execution_profile, 310 HloExecutionProfile* hlo_execution_profile, se::Stream* stream) { 311 return OkStatus(); 312 } 313 314 // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a 315 // timer for the execution, sets up HLO profiling if enabled, and fills in the 316 // given ExecutionProfile if non-null. 317 StatusOr<ScopedShapedBuffer> ExecuteOnStreamWrapper( 318 const ServiceExecutableRunOptions* run_options, 319 absl::Span<const ShapedBuffer* const> arguments); 320 321 StatusOr<ExecutionOutput> ExecuteOnStreamWrapper( 322 const ServiceExecutableRunOptions* run_options, 323 std::vector<ExecutionInput> arguments); 324 325 StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStreamWrapper( 326 const ServiceExecutableRunOptions* run_options, 327 absl::Span<const ShapedBuffer* const> arguments); 328 329 StatusOr<ExecutionOutput> ExecuteAsyncOnStreamWrapper( 330 const ServiceExecutableRunOptions* run_options, 331 std::vector<ExecutionInput> arguments); 332 hlo_profile_printer_data()333 const HloProfilePrinterData& hlo_profile_printer_data() const { 334 CHECK(hlo_profiling_enabled()); 335 return *hlo_profile_printer_data_; 336 } 337 hlo_profile_index_map()338 const HloProfileIndexMap& hlo_profile_index_map() const { 339 CHECK(hlo_profiling_enabled()); 340 return *hlo_profile_index_map_; 341 } 342 343 // Returns whether this executable was compiled with HLO profilings support 344 // enabled. If not, the caller should not expect an hlo_execution_profile 345 // passed to ExecuteOnStream above to be populated during execution. hlo_profiling_enabled()346 bool hlo_profiling_enabled() const { 347 return hlo_profile_printer_data_ != nullptr; 348 } 349 module()350 HloModule& module() const { return *hlo_module_; } shared_module()351 std::shared_ptr<HloModule> shared_module() const { return hlo_module_; } 352 has_module()353 const bool has_module() const { return hlo_module_ != nullptr; } 354 module_config()355 const HloModuleConfig& module_config() const { return hlo_module_->config(); } 356 357 // The shape (including layout) that results from this execution. This is the 358 // shape of the DeviceMemoryBase result value in ExecuteOnStream above. result_shape()359 const Shape& result_shape() const { 360 return hlo_module_->config().entry_computation_layout().result_shape(); 361 } 362 363 // Returns the size of the executable in bytes. Returns -1 if this query is 364 // not supported by the executable. 365 // 366 // Does not include the size of used libraries (e.g. cuDNN, Eigen, etc.). 367 virtual int64_t SizeOfGeneratedCodeInBytes() const; 368 369 // Dumping helpers. set_hlo_proto(std::unique_ptr<xla::HloProto> hlo_proto)370 void set_hlo_proto(std::unique_ptr<xla::HloProto> hlo_proto) { 371 hlo_proto_ = std::move(hlo_proto); 372 } dumping_snapshot()373 bool dumping_snapshot() const { 374 return module_config().debug_options().xla_dump_hlo_snapshots(); 375 } hlo_proto()376 HloProto const* hlo_proto() const { return hlo_proto_.get(); } 377 debug_info()378 std::string& debug_info() { return debug_info_; } set_debug_info(const std::string & debug_info)379 void set_debug_info(const std::string& debug_info) { 380 debug_info_ = debug_info; 381 } 382 // Gather unused but donated buffers, return them to the caller of this API. 383 // We don't free buffers inside this function since the caller could have 384 // different preferences for buffer deallocation. For example, in TensorFlow, 385 // buffers are mostly efficiently deallocated as soon as a program has been 386 // launched. However, in XRT, the buffers are expected to be deallocated after 387 // the program has finished since XRT doesn't support async deallocation. 388 void MarkToBeReleasedArguments(absl::Span<ExecutionInput> arguments, 389 ExecutionOutput& result); 390 391 protected: 392 // HloModule this was compiled from. BufferAssignment keeps pointers to 393 // HloInstructions owned by the HloModule so we need to keep the HloModule 394 // around. 395 const std::shared_ptr<HloModule> hlo_module_; 396 397 // The serialized HLO proto. Non-null only if dumping snapshots is enabled. 398 std::unique_ptr<HloProto const> hlo_proto_; 399 400 // Execution count, used to generate a unique filename for each dumped 401 // execution. 402 int64_t execution_count_ = 0; 403 404 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_; 405 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_; 406 407 // Generic debug information as a string. 408 std::string debug_info_; 409 }; 410 411 } // namespace xla 412 413 #endif // TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_ 414