/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_

#include <memory>
#include <set>
#include <utility>
#include <vector>

#include "absl/types/span.h"
#include "absl/types/variant.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
// TODO(b/150633678): Both the ExecutionInput and ExecutionOutput need to be
// revisited, with the execute APIs taking data structures which can better
// model shareable buffers.
//
// ExecutionInput buffers are in one of three states:
//
// 1) Owned by the caller and immutable.
// 2) Donated by the caller but returned on error.
// 3) Donated by the caller and freed on error.
//
// Case (1) buffers are stored as MaybeOwningDeviceMemory(DeviceMemoryBase).
// Case (2) buffers are stored as MaybeOwningDeviceMemory(OwningDeviceMemory),
//   with their indices present in unowned_indices_.
// Case (3) buffers are stored as MaybeOwningDeviceMemory(OwningDeviceMemory),
//   with their indices absent from unowned_indices_.
class ExecutionInput {
 public:
  explicit ExecutionInput(xla::Shape shape) : buffers_(std::move(shape)) {
    SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape()));
  }
  // TODO(b/170310047): remove this overload.
  ExecutionInput(xla::Shape shape, xla::Shape host_shape)
      : buffers_(std::move(shape)) {
    SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape()));
  }

  explicit ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers)
      : buffers_(std::move(buffers)) {
    SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape()));
  }
  // TODO(b/170310047): remove this overload.
  ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers,
                 xla::Shape host_shape)
      : buffers_(std::move(buffers)) {
    SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape()));
  }

  ExecutionInput(ExecutionInput&&) = default;

  ~ExecutionInput();

  ExecutionInput& operator=(ExecutionInput&&) = default;

  const Shape& shape() const {
    return dynamic_shape_ != nullptr ? *dynamic_shape_ : buffers_.shape();
  }

  const Shape& host_shape() const {
    return host_shape_ != nullptr ? *host_shape_ : shape();
  }

  Status SetDynamicShape(Shape dynamic_shape);

  xla::StatusOr<xla::ShapedBuffer> ToShapedBuffer(
      se::DeviceMemoryAllocator* allocator, int device_ordinal) const;

  void SetBuffer(const ShapeIndex& index, MaybeOwningDeviceMemory buffer) {
    *buffers_.mutable_element(index) = std::move(buffer);
  }

  void SetUnownedBuffer(const ShapeIndex& index,
                        MaybeOwningDeviceMemory buffer);

  void SetUnownedIndex(const ShapeIndex& index) {
    unowned_indices_.insert(index);
  }

  void ClearUnownedIndex(const ShapeIndex& index) {
    unowned_indices_.erase(index);
  }

  const std::set<ShapeIndex>& unowned_indices() { return unowned_indices_; }

  const ShapeTree<MaybeOwningDeviceMemory>& Buffers() const { return buffers_; }

  ShapeTree<MaybeOwningDeviceMemory>* MutableBuffers() { return &buffers_; }

  MaybeOwningDeviceMemory* MutableBuffer(const ShapeIndex& index) {
    return buffers_.mutable_element(index);
  }

  const MaybeOwningDeviceMemory& Buffer(const ShapeIndex& index) const {
    return buffers_.element(index);
  }

 private:
  void SetHostShape(xla::Shape host_shape) {
    if (shape() != host_shape) {
      host_shape_ = std::make_unique<Shape>(std::move(host_shape));
    }
  }

  ShapeTree<MaybeOwningDeviceMemory> buffers_;
  // Set of indices of buffers that should be returned to the caller if an
  // error occurs when enqueuing the computation.
  std::set<ShapeIndex> unowned_indices_;
  std::unique_ptr<Shape> dynamic_shape_;
  std::unique_ptr<Shape> host_shape_;
};
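
// Example (illustrative sketch, not part of this header): how a caller might
// express the three buffer states above for a two-element tuple input. The
// shapes and device memory values are assumed to come from the surrounding
// runtime.
//
//   ExecutionInput input(tuple_device_shape);
//   // Case (1): owned by the caller; wraps a raw se::DeviceMemoryBase.
//   input.SetBuffer(/*index=*/{0}, MaybeOwningDeviceMemory(borrowed_base));
//   // Case (2): donated, but returned to the caller on error.
//   input.SetUnownedBuffer(/*index=*/{1},
//                          MaybeOwningDeviceMemory(std::move(owned_memory)));
//   // Case (3): donated and freed on error, i.e. SetBuffer with an
//   // OwningDeviceMemory and no matching SetUnownedIndex call.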

// ExecutionOutput encapsulates the output buffers of an execution and the
// leftover buffers to be released by the caller.
class ExecutionOutput {
 public:
  explicit ExecutionOutput(ScopedShapedBuffer result)
      : result_(std::move(result)) {}
  ExecutionOutput(ScopedShapedBuffer result,
                  std::vector<se::OwningDeviceMemory> to_be_released)
      : result_(std::move(result)),
        to_be_released_(std::move(to_be_released)) {}
  // TODO(b/170310047): remove this overload.
  ExecutionOutput(Shape on_host_shape, Shape on_device_shape,
                  se::DeviceMemoryAllocator* allocator, int device_ordinal)
      : result_(std::move(on_device_shape), allocator, device_ordinal) {}
  ExecutionOutput(Shape on_device_shape, se::DeviceMemoryAllocator* allocator,
                  int device_ordinal)
      : result_(std::move(on_device_shape), allocator, device_ordinal) {}
  ExecutionOutput(ExecutionOutput&&) = default;
  ExecutionOutput& operator=(ExecutionOutput&&) = default;
  ~ExecutionOutput() {
    // If the ExecutionOutput has not been committed, and if there are aliased
    // indices, clear them off the ScopedShapedBuffer to prevent them from
    // being released.
    for (auto& index : aliased_indices_) {
      result_.set_buffer(se::OwningDeviceMemory(), index);
    }
  }

  void AddAliasedIndex(ShapeIndex index) {
    aliased_indices_.push_back(std::move(index));
  }

  void AddToBeReleased(se::OwningDeviceMemory mem) {
    to_be_released_.push_back(std::move(mem));
  }

  // Should be called once it is known that the execute operation succeeded,
  // before returning the ExecutionOutput to the caller.
  ExecutionOutput& Commit() {
    aliased_indices_.clear();
    return *this;
  }

  const ScopedShapedBuffer& Result() const { return result_; }

  ScopedShapedBuffer* MutableResult() { return &result_; }

  ScopedShapedBuffer ConsumeResult() {
    aliased_indices_.clear();
    return std::move(result_);
  }

  const std::vector<se::OwningDeviceMemory>& ToBeReleased() const {
    return to_be_released_;
  }

  std::vector<se::OwningDeviceMemory> ConsumeToBeReleased() {
    return std::move(to_be_released_);
  }

  std::vector<ShapeIndex> ConsumeAliasedIndices() {
    auto aliased = std::move(aliased_indices_);
    aliased_indices_.clear();
    return aliased;
  }

 private:
  ScopedShapedBuffer result_;

  // Leftover buffers for the caller to release. Elements in this list are
  // donated input memory buffers that are not reused by XLA as outputs.
  std::vector<se::OwningDeviceMemory> to_be_released_;

  // These are the indices in result_ which have been aliased from the caller.
  // If the execution operation fails, the caller should maintain ownership of
  // the buffer, so we track the indices here, and unless the ExecutionOutput
  // is committed, we remove them from the result_ before destruction.
  std::vector<ShapeIndex> aliased_indices_;

  // A shape table is a contiguous region in memory that is used to hold the
  // runtime dimension sizes of dynamic output shapes.
  se::OwningDeviceMemory output_shape_table_;
};
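
// Example (illustrative sketch): typical lifecycle of an ExecutionOutput
// inside an Execute* implementation. Everything other than the
// ExecutionOutput methods is hypothetical.
//
//   ExecutionOutput out(std::move(result_buffer));
//   out.AddAliasedIndex({0});  // output {0} reuses a donated input buffer
//   Status s = EnqueueWorkOnStream();
//   if (!s.ok()) return s;     // ~ExecutionOutput detaches the aliased
//                              // buffer so the caller keeps ownership
//   out.Commit();              // success: stop protecting aliased indices
//   return std::move(out);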

// A given platform's compiler will produce an Executable -- this is a uniform
// interface that is used for launching compiled programs across platforms.
class Executable {
 public:
  explicit Executable(std::shared_ptr<HloModule> hlo_module)
      : hlo_module_(std::move(hlo_module)) {}

  // TODO(b/172012028): Remove this constructor.
  explicit Executable(
      std::shared_ptr<HloModule> hlo_module,
      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
      : hlo_module_(std::move(hlo_module)),
        hlo_profile_printer_data_(std::move(hlo_profile_printer_data)),
        hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
    CHECK_EQ(hlo_profile_printer_data_.get() == nullptr,
             hlo_profile_index_map_.get() == nullptr);
  }
  virtual ~Executable() {}

  // Enqueues the compilation result on the provided stream, passing the given
  // arguments. This call is blocking and returns after the execution is done.
  //
  // If the hlo_execution_profile is provided as non-nullptr, profiling will be
  // enabled.
  //
  // Returns a shaped buffer containing the result of the computation.
  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile);

  // Starts the given program executing on the given stream/executor.
  //
  // `arguments` are ShapeTrees containing the input parameters. For each
  // element in the shape tree, if the element holds ownership of the memory,
  // it is considered donated and XLA will potentially reuse it as an output
  // buffer. For all donated inputs, XLA is also responsible for freeing them.
  //
  // If an input is donated to XLA but is not reused as output, it is returned
  // as a leftover buffer for the caller to release.
  //
  // This call should be non-blocking and may return as soon as all of the
  // operations are enqueued for launch on the stream. Note that some
  // implementations may in fact block or may block in some circumstances
  // (e.g., when profiling); i.e., asynchronous is a "may" not a "must".
  //
  // If the hlo_execution_profile is provided as non-nullptr, profiling will be
  // enabled. Note that profiling is tricky to use correctly, as the profiling
  // objects (when they exist) must outlive the task.
  virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile);

  // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to
  // complete.
  StatusOr<ExecutionOutput> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments,
      HloExecutionProfile* hlo_execution_profile);

  virtual StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments,
      HloExecutionProfile* hlo_execution_profile) = 0;
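
  // Example (illustrative sketch, names are hypothetical): a backend
  // implements the pure-virtual overload above by enqueuing its device work
  // on run_options->stream():
  //
  //   class MyBackendExecutable : public Executable {
  //    public:
  //     using Executable::Executable;
  //     StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
  //         const ServiceExecutableRunOptions* run_options,
  //         std::vector<ExecutionInput> arguments,
  //         HloExecutionProfile* hlo_execution_profile) override {
  //       // Launch kernels on run_options->stream(), then wrap the result
  //       // buffers (and any leftover donated buffers) in ExecutionOutput.
  //     }
  //   };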

  // Same as ExecuteOnStream(), but runs this executable on multiple
  // streams. arguments[i] contains the arguments to the execution on
  // run_options[i]->stream() and the returned value is at index i of the
  // returned vector.
  virtual StatusOr<std::vector<ScopedShapedBuffer>> ExecuteOnStreams(
      absl::Span<const ServiceExecutableRunOptions> run_options,
      absl::Span<const absl::Span<const ShapedBuffer* const>> arguments);

  // Populates `hlo_execution_profile` from `executor`. This is implicit in any
  // Execute* API call that takes a hlo_execution_profile argument, but must be
  // called explicitly for other (async, for example) variants after the stream
  // has completed.
  virtual Status PopulateExecutionProfile(
      ExecutionProfile* execution_profile,
      HloExecutionProfile* hlo_execution_profile, se::Stream* stream) {
    return OkStatus();
  }

  // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
  // timer for the execution, sets up HLO profiling if enabled, and fills in
  // the given ExecutionProfile if non-null.
  StatusOr<ScopedShapedBuffer> ExecuteOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments);

  StatusOr<ExecutionOutput> ExecuteOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments);

  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments);

  StatusOr<ExecutionOutput> ExecuteAsyncOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options,
      std::vector<ExecutionInput> arguments);
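
  // Example (illustrative sketch): a typical synchronous caller of the
  // wrappers above; the executable, run options, and argument-building
  // helper are assumed to exist.
  //
  //   std::vector<ExecutionInput> args = BuildArguments();  // hypothetical
  //   TF_ASSIGN_OR_RETURN(
  //       ExecutionOutput out,
  //       executable->ExecuteOnStreamWrapper(&run_options, std::move(args)));
  //   ScopedShapedBuffer result = out.ConsumeResult();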

  const HloProfilePrinterData& hlo_profile_printer_data() const {
    CHECK(hlo_profiling_enabled());
    return *hlo_profile_printer_data_;
  }

  const HloProfileIndexMap& hlo_profile_index_map() const {
    CHECK(hlo_profiling_enabled());
    return *hlo_profile_index_map_;
  }
  // Returns whether this executable was compiled with HLO profiling support
  // enabled. If not, the caller should not expect an hlo_execution_profile
  // passed to ExecuteOnStream above to be populated during execution.
  bool hlo_profiling_enabled() const {
    return hlo_profile_printer_data_ != nullptr;
  }

  HloModule& module() const { return *hlo_module_; }
  std::shared_ptr<HloModule> shared_module() const { return hlo_module_; }

  bool has_module() const { return hlo_module_ != nullptr; }

  const HloModuleConfig& module_config() const { return hlo_module_->config(); }

  // The shape (including layout) that results from this execution. This is the
  // shape of the DeviceMemoryBase result value in ExecuteOnStream above.
  const Shape& result_shape() const {
    return hlo_module_->config().entry_computation_layout().result_shape();
  }

  // Returns the size of the executable in bytes. Returns -1 if this query is
  // not supported by the executable.
  //
  // Does not include the size of used libraries (e.g. cuDNN, Eigen, etc.).
  virtual int64_t SizeOfGeneratedCodeInBytes() const;

  // Dumping helpers.
  void set_hlo_proto(std::unique_ptr<xla::HloProto> hlo_proto) {
    hlo_proto_ = std::move(hlo_proto);
  }
  bool dumping_snapshot() const {
    return module_config().debug_options().xla_dump_hlo_snapshots();
  }
  HloProto const* hlo_proto() const { return hlo_proto_.get(); }

  std::string& debug_info() { return debug_info_; }
  void set_debug_info(const std::string& debug_info) {
    debug_info_ = debug_info;
  }
  // Gather unused but donated buffers, return them to the caller of this API.
  // We don't free buffers inside this function since the caller could have
  // different preferences for buffer deallocation. For example, in TensorFlow,
  // buffers are most efficiently deallocated as soon as a program has been
  // launched. However, in XRT, the buffers are expected to be deallocated
  // after the program has finished, since XRT doesn't support async
  // deallocation.
  void MarkToBeReleasedArguments(absl::Span<ExecutionInput> arguments,
                                 ExecutionOutput& result);
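
  // Example (illustrative sketch): after a successful launch, an Execute*
  // implementation can route unused donated buffers back to the caller:
  //
  //   MarkToBeReleasedArguments(absl::MakeSpan(arguments), execution_output);
  //   for (se::OwningDeviceMemory& mem :
  //        execution_output.ConsumeToBeReleased()) {
  //     // Deallocation policy is up to the caller; here each
  //     // OwningDeviceMemory frees its buffer when it goes out of scope.
  //   }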

 protected:
  // HloModule this was compiled from. BufferAssignment keeps pointers to
  // HloInstructions owned by the HloModule so we need to keep the HloModule
  // around.
  const std::shared_ptr<HloModule> hlo_module_;

  // The serialized HLO proto. Non-null only if dumping snapshots is enabled.
  std::unique_ptr<HloProto const> hlo_proto_;

  // Execution count, used to generate a unique filename for each dumped
  // execution.
  int64_t execution_count_ = 0;

  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_;
  std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;

  // Generic debug information as a string.
  std::string debug_info_;
};

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_