1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_ 17 #define TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_ 18 19 #include <memory> 20 #include <set> 21 #include <utility> 22 #include <vector> 23 24 #include "absl/types/span.h" 25 #include "absl/types/variant.h" 26 #include "tensorflow/compiler/xla/debug_options_flags.h" 27 #include "tensorflow/compiler/xla/service/computation_layout.h" 28 #include "tensorflow/compiler/xla/service/hlo.pb.h" 29 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" 30 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" 31 #include "tensorflow/compiler/xla/service/hlo_module.h" 32 #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" 33 #include "tensorflow/compiler/xla/service/service_executable_run_options.h" 34 #include "tensorflow/compiler/xla/service/shaped_buffer.h" 35 #include "tensorflow/compiler/xla/shape_tree.h" 36 #include "tensorflow/compiler/xla/statusor.h" 37 #include "tensorflow/compiler/xla/util.h" 38 #include "tensorflow/compiler/xla/xla_data.pb.h" 39 #include "tensorflow/core/platform/mutex.h" 40 #include "tensorflow/core/platform/stream_executor_no_cuda.h" 41 #include "tensorflow/core/platform/thread_annotations.h" 42 #include "tensorflow/stream_executor/device_memory_allocator.h" 43 44 namespace xla { 45 46 // TODO(b/150633678): Both the ExecutionInput and ExecutionOutput need to be 47 // revisited, with the execute APIs taking data structure which can better model 48 // shareable buffers. 49 // 50 // ExecutionInput buffers are in one of three states: 51 // 52 // 1) Owned by the caller and immutable. 53 // 2) Donated by the caller but returned on error. 54 // 3) Donated by the caller and freed on error. 55 // 56 // Case (1) buffers are stored as MaybeOwningDeviceMemory(DeviceMemoryBase). 57 // Case (2) buffers are stored as MaybeOwningDeviceMemory(OwningDeviceMemory), 58 // with their indices present in unowned_indices_. 59 // Case (3) buffers are stored as MaybeOwningDeviceMemory(OwningDeviceMemory), 60 // with their indices absent from unowned_indices_. 61 class ExecutionInput { 62 public: ExecutionInput(xla::Shape shape)63 explicit ExecutionInput(xla::Shape shape) : buffers_(std::move(shape)) { 64 SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); 65 } 66 // TODO(b/170310047): remove this overload. ExecutionInput(xla::Shape shape,xla::Shape host_shape)67 ExecutionInput(xla::Shape shape, xla::Shape host_shape) 68 : buffers_(std::move(shape)) { 69 SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); 70 } 71 ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers)72 explicit ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers) 73 : buffers_(std::move(buffers)) { 74 SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); 75 } 76 // TODO(b/170310047): remove this overload. ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers,xla::Shape host_shape)77 ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers, 78 xla::Shape host_shape) 79 : buffers_(std::move(buffers)) { 80 SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); 81 } 82 83 ExecutionInput(ExecutionInput&&) = default; 84 85 ~ExecutionInput(); 86 87 ExecutionInput& operator=(ExecutionInput&&) = default; 88 shape()89 const Shape& shape() const { 90 return dynamic_shape_ != nullptr ? *dynamic_shape_ : buffers_.shape(); 91 } 92 host_shape()93 const Shape& host_shape() const { 94 return host_shape_ != nullptr ? *host_shape_ : shape(); 95 } 96 97 Status SetDynamicShape(Shape dynamic_shape); 98 99 xla::StatusOr<xla::ShapedBuffer> ToShapedBuffer( 100 se::DeviceMemoryAllocator* allocator, int device_ordinal) const; 101 SetBuffer(const ShapeIndex & index,MaybeOwningDeviceMemory buffer)102 void SetBuffer(const ShapeIndex& index, MaybeOwningDeviceMemory buffer) { 103 *buffers_.mutable_element(index) = std::move(buffer); 104 } 105 106 void SetUnownedBuffer(const ShapeIndex& index, 107 MaybeOwningDeviceMemory buffer); 108 SetUnownedIndex(const ShapeIndex & index)109 void SetUnownedIndex(const ShapeIndex& index) { 110 unowned_indices_.insert(index); 111 } 112 ClearUnownedIndex(const ShapeIndex & index)113 void ClearUnownedIndex(const ShapeIndex& index) { 114 unowned_indices_.erase(index); 115 } 116 unowned_indices()117 const std::set<ShapeIndex>& unowned_indices() { return unowned_indices_; } 118 Buffers()119 const ShapeTree<MaybeOwningDeviceMemory>& Buffers() const { return buffers_; } 120 MutableBuffers()121 ShapeTree<MaybeOwningDeviceMemory>* MutableBuffers() { return &buffers_; } 122 MutableBuffer(const ShapeIndex & index)123 MaybeOwningDeviceMemory* MutableBuffer(const ShapeIndex& index) { 124 return buffers_.mutable_element(index); 125 } 126 Buffer(const ShapeIndex & index)127 const MaybeOwningDeviceMemory& Buffer(const ShapeIndex& index) const { 128 return buffers_.element(index); 129 } 130 131 private: SetHostShape(xla::Shape host_shape)132 void SetHostShape(xla::Shape host_shape) { 133 if (shape() != host_shape) { 134 host_shape_ = absl::make_unique<Shape>(std::move(host_shape)); 135 } 136 } 137 138 ShapeTree<MaybeOwningDeviceMemory> buffers_; 139 // Set of indices of buffers that should be returned to the caller if an error 140 // occurs when enqueuing the computation. 141 std::set<ShapeIndex> unowned_indices_; 142 std::unique_ptr<Shape> dynamic_shape_; 143 std::unique_ptr<Shape> host_shape_; 144 }; 145 146 // ExecutionOutput encapsulates the output buffers of a execution and the 147 // leftover buffers to be released by the caller. 148 class ExecutionOutput { 149 public: ExecutionOutput(ScopedShapedBuffer result)150 explicit ExecutionOutput(ScopedShapedBuffer result) 151 : result_(std::move(result)) {} ExecutionOutput(ScopedShapedBuffer result,std::vector<se::OwningDeviceMemory> to_be_released)152 ExecutionOutput(ScopedShapedBuffer result, 153 std::vector<se::OwningDeviceMemory> to_be_released) 154 : result_(std::move(result)), 155 to_be_released_(std::move(to_be_released)) {} 156 // TODO(b/170310047): remove this overload. ExecutionOutput(Shape on_host_shape,Shape on_device_shape,se::DeviceMemoryAllocator * allocator,int device_ordinal)157 ExecutionOutput(Shape on_host_shape, Shape on_device_shape, 158 se::DeviceMemoryAllocator* allocator, int device_ordinal) 159 : result_(std::move(on_device_shape), allocator, device_ordinal) {} ExecutionOutput(Shape on_device_shape,se::DeviceMemoryAllocator * allocator,int device_ordinal)160 ExecutionOutput(Shape on_device_shape, se::DeviceMemoryAllocator* allocator, 161 int device_ordinal) 162 : result_(std::move(on_device_shape), allocator, device_ordinal) {} 163 ExecutionOutput(ExecutionOutput&&) = default; 164 ExecutionOutput& operator=(ExecutionOutput&&) = default; 165 ~ExecutionOutput()166 ~ExecutionOutput() { 167 // If the ExecutionOutput has not been committed, and if there are aliased 168 // indices, clear them off the ScopedShapedBuffer to prevent them to be 169 // released. 170 for (auto& index : aliased_indices_) { 171 result_.set_buffer(se::OwningDeviceMemory(), index); 172 } 173 } 174 AddAliasedIndex(ShapeIndex index)175 void AddAliasedIndex(ShapeIndex index) { 176 aliased_indices_.push_back(std::move(index)); 177 } 178 AddToBeReleased(se::OwningDeviceMemory mem)179 void AddToBeReleased(se::OwningDeviceMemory mem) { 180 to_be_released_.push_back(std::move(mem)); 181 } 182 183 // Should be called once it is known that the execute operation succeeded, 184 // before returning the ExecutionOutput to the caller. Commit()185 ExecutionOutput& Commit() { 186 aliased_indices_.clear(); 187 return *this; 188 } 189 Result()190 const ScopedShapedBuffer& Result() const { return result_; } 191 MutableResult()192 ScopedShapedBuffer* MutableResult() { return &result_; } 193 ConsumeResult()194 ScopedShapedBuffer ConsumeResult() { 195 aliased_indices_.clear(); 196 return std::move(result_); 197 } 198 ToBeReleased()199 const std::vector<se::OwningDeviceMemory>& ToBeReleased() const { 200 return to_be_released_; 201 } 202 ConsumeToBeReleased()203 std::vector<se::OwningDeviceMemory> ConsumeToBeReleased() { 204 return std::move(to_be_released_); 205 } 206 ConsumeAliasedIndices()207 std::vector<ShapeIndex> ConsumeAliasedIndices() { 208 auto aliased = std::move(aliased_indices_); 209 aliased_indices_.clear(); 210 return aliased; 211 } 212 213 private: 214 ScopedShapedBuffer result_; 215 216 // Leftover buffers for the caller to release. Elements in this list are 217 // donated input memory buffers that are not reused by XLA as outputs. 218 std::vector<se::OwningDeviceMemory> to_be_released_; 219 220 // These are the indices in result_ which have been aliased from the caller. 221 // If the execution operation fails, the caller should maintain ownership of 222 // the buffer, so we track the indices here, and unless the ExecutionOutput is 223 // committed, we remove them from the result_ before destruction. 224 std::vector<ShapeIndex> aliased_indices_; 225 226 // A shape table is a continuous region in memory that is used to hold the 227 // runtime dimension sizes of dynamic output shapes. 228 se::OwningDeviceMemory output_shape_table_; 229 }; 230 231 // A given platform's compiler will produce an Executable -- this is a uniform 232 // interface that is used for launching compiled programs across platforms. 233 class Executable { 234 public: Executable(std::shared_ptr<HloModule> hlo_module)235 explicit Executable(std::shared_ptr<HloModule> hlo_module) 236 : hlo_module_(std::move(hlo_module)) {} 237 238 // TODO(b/172012028): Remove this constructor. Executable(std::shared_ptr<HloModule> hlo_module,std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)239 explicit Executable( 240 std::shared_ptr<HloModule> hlo_module, 241 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data, 242 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map) 243 : hlo_module_(std::move(hlo_module)), 244 hlo_profile_printer_data_(std::move(hlo_profile_printer_data)), 245 hlo_profile_index_map_(std::move(hlo_profile_index_map)) { 246 CHECK_EQ(hlo_profile_printer_data_.get() == nullptr, 247 hlo_profile_index_map_.get() == nullptr); 248 } ~Executable()249 virtual ~Executable() {} 250 251 // Enqueues the compilation result on the provided stream, passing the given 252 // arguments. This call is blocking and returns after the execution is done. 253 // 254 // If the hlo_execution_profile is provided as non-nullptr, profiling will be 255 // enabled. 256 // 257 // Returns a shaped buffer containing the result of the computation. 258 StatusOr<ScopedShapedBuffer> ExecuteOnStream( 259 const ServiceExecutableRunOptions* run_options, 260 absl::Span<const ShapedBuffer* const> arguments, 261 HloExecutionProfile* hlo_execution_profile); 262 263 // Starts the given program executing on the given stream/executor. 264 // 265 // `arguments` are ShapeTree containing the input parameters. For each element 266 // in the shape tree, if the element holds the ownership of the memory, it is 267 // considered donated and XLA will potentially reuse it as output buffers. For 268 // all donated inputs, XLA is also responsible for freeing them. 269 // 270 // If an input is donated to XLA but is not reused as output, it is returned 271 // as an leftover buffer for the caller to release. 272 // 273 // This call should be non-blocking and may return as soon as all of the 274 // operations are enqueued for launch on the stream. Note that some 275 // implementations may in fact block or may block in some circumstances (e.g., 276 // when profiling); i.e., asynchronous is a "may" not a "must". 277 // 278 // If the hlo_execution_profile is provided as non-nullptr, profiling will be 279 // enabled. Note that profiling is tricky to use correctly, as the profiling 280 // objects (when they exist) must out-live the task. 281 virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream( 282 const ServiceExecutableRunOptions* run_options, 283 absl::Span<const ShapedBuffer* const> arguments, 284 HloExecutionProfile* hlo_execution_profile); 285 286 // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to 287 // complete. 288 StatusOr<ExecutionOutput> ExecuteOnStream( 289 const ServiceExecutableRunOptions* run_options, 290 std::vector<ExecutionInput> arguments, 291 HloExecutionProfile* hlo_execution_profile); 292 293 virtual StatusOr<ExecutionOutput> ExecuteAsyncOnStream( 294 const ServiceExecutableRunOptions* run_options, 295 std::vector<ExecutionInput> arguments, 296 HloExecutionProfile* hlo_execution_profile) = 0; 297 298 // Same as ExecuteOnStream(), but runs this executable on multiple 299 // streams. arguments[i] contains the arguments to the execution on 300 // run_options[i]->stream() and the returned value is at index i of the 301 // returned vector. 302 virtual StatusOr<std::vector<ScopedShapedBuffer>> ExecuteOnStreams( 303 absl::Span<const ServiceExecutableRunOptions> run_options, 304 absl::Span<const absl::Span<const ShapedBuffer* const>> arguments); 305 306 // Populates `hlo_execution_profile` from `executor`. This is implicit in any 307 // Execute* API call that takes a hlo_execution_profile argument, but must be 308 // called explicitly for other (async, for example) variants after the stream 309 // has completed. PopulateExecutionProfile(ExecutionProfile * execution_profile,HloExecutionProfile * hlo_execution_profile,se::Stream * stream)310 virtual Status PopulateExecutionProfile( 311 ExecutionProfile* execution_profile, 312 HloExecutionProfile* hlo_execution_profile, se::Stream* stream) { 313 return Status::OK(); 314 } 315 316 // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a 317 // timer for the execution, sets up HLO profiling if enabled, and fills in the 318 // given ExecutionProfile if non-null. 319 StatusOr<ScopedShapedBuffer> ExecuteOnStreamWrapper( 320 const ServiceExecutableRunOptions* run_options, 321 absl::Span<const ShapedBuffer* const> arguments); 322 323 StatusOr<ExecutionOutput> ExecuteOnStreamWrapper( 324 const ServiceExecutableRunOptions* run_options, 325 std::vector<ExecutionInput> arguments); 326 327 StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStreamWrapper( 328 const ServiceExecutableRunOptions* run_options, 329 absl::Span<const ShapedBuffer* const> arguments); 330 331 StatusOr<ExecutionOutput> ExecuteAsyncOnStreamWrapper( 332 const ServiceExecutableRunOptions* run_options, 333 std::vector<ExecutionInput> arguments); 334 hlo_profile_printer_data()335 const HloProfilePrinterData& hlo_profile_printer_data() const { 336 CHECK(hlo_profiling_enabled()); 337 return *hlo_profile_printer_data_; 338 } 339 hlo_profile_index_map()340 const HloProfileIndexMap& hlo_profile_index_map() const { 341 CHECK(hlo_profiling_enabled()); 342 return *hlo_profile_index_map_; 343 } 344 345 // Returns whether this executable was compiled with HLO profilings support 346 // enabled. If not, the caller should not expect an hlo_execution_profile 347 // passed to ExecuteOnStream above to be populated during execution. hlo_profiling_enabled()348 bool hlo_profiling_enabled() const { 349 return hlo_profile_printer_data_ != nullptr; 350 } 351 module()352 HloModule& module() const { return *hlo_module_; } shared_module()353 std::shared_ptr<HloModule> shared_module() const { return hlo_module_; } 354 has_module()355 const bool has_module() const { return hlo_module_ != nullptr; } 356 module_config()357 const HloModuleConfig& module_config() const { return hlo_module_->config(); } 358 359 // The shape (including layout) that results from this execution. This is the 360 // shape of the DeviceMemoryBase result value in ExecuteOnStream above. result_shape()361 const Shape& result_shape() const { 362 return hlo_module_->config().entry_computation_layout().result_shape(); 363 } 364 365 // Returns the size of the executable in bytes. Returns -1 if this query is 366 // not supported by the executable. 367 // 368 // Does not include the size of used libraries (e.g. cuDNN, Eigen, etc.). 369 virtual int64 SizeOfGeneratedCodeInBytes() const; 370 371 // Dumping helpers. set_hlo_proto(std::unique_ptr<xla::HloProto> hlo_proto)372 void set_hlo_proto(std::unique_ptr<xla::HloProto> hlo_proto) { 373 hlo_proto_ = std::move(hlo_proto); 374 } dumping_snapshot()375 bool dumping_snapshot() const { return hlo_proto_ != nullptr; } hlo_proto()376 HloProto const* hlo_proto() const { return hlo_proto_.get(); } 377 debug_info()378 std::string& debug_info() { return debug_info_; } set_debug_info(const std::string & debug_info)379 void set_debug_info(const std::string& debug_info) { 380 debug_info_ = debug_info; 381 } 382 // Gather unused but donated buffers, return them to the caller of this API. 383 // We don't free buffers inside this function since the caller could have 384 // different preferences for buffer deallocation. For example, in TensorFlow, 385 // buffers are mostly efficiently deallocated as soon as a program has been 386 // launched. However, in XRT, the buffers are expected to be deallocated after 387 // the program has finished since XRT doesn't support async deallocation. 388 void MarkToBeReleasedArguments(absl::Span<ExecutionInput> arguments, 389 ExecutionOutput& result); 390 391 protected: 392 // HloModule this was compiled from. BufferAssignment keeps pointers to 393 // HloInstructions owned by the HloModule so we need to keep the HloModule 394 // around. 395 const std::shared_ptr<HloModule> hlo_module_; 396 397 // The serialized HLO proto. Non-null only if dumping snapshots is enabled. 398 std::unique_ptr<HloProto const> hlo_proto_; 399 400 // Execution count, used to generate a unique filename for each dumped 401 // execution. 402 int64 execution_count_ = 0; 403 404 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_; 405 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_; 406 407 // Generic debug information as a string. 408 std::string debug_info_; 409 }; 410 411 } // namespace xla 412 413 #endif // TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_ 414