/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_

#include <algorithm>
#include <functional>
#include <list>
#include <memory>
#include <set>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/container/inlined_vector.h"
#include "tensorflow/compiler/xla/service/buffer_value.h"
#include "tensorflow/compiler/xla/service/buffer_value_containers.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_buffer.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_live_range.h"
#include "tensorflow/compiler/xla/service/hlo_ordering.h"
#include "tensorflow/compiler/xla/service/hlo_schedule.h"
#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {

// Forward declare classes defined below.
template <typename BufferType>
class HeapAlgorithm;
template <typename BufferType>
class NoFragmentationStatsHeap;

// HeapSimulator assigns buffer offsets by running a simulation of a regular
// memory heap with Alloc and Free calls.  It only works for completely
// sequential instruction sequences.  Unlike regular heaps, we have the
// advantage that the sequence of Alloc and Free calls is known up-front; we
// don't need to return the assignment of buffer offsets until the very end.
class HeapSimulator {
 public:
  // Chunk represents a contiguous piece of memory.  Each BufferValue will be
  // associated with a chunk in the assignment result.
  struct Chunk {
    int64_t offset;
    int64_t size;

    int64_t chunk_end() const { return offset + size; }

    bool OverlapsWith(Chunk other_chunk) const;

    bool operator==(const Chunk& other) const {
      return offset == other.offset && size == other.size;
    }
  };
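
  // Illustrative sketch (not part of the API), assuming half-open
  // [offset, offset + size) intervals:
  //
  //   HeapSimulator::Chunk a{/*offset=*/0, /*size=*/16};   // bytes [0, 16)
  //   HeapSimulator::Chunk b{/*offset=*/8, /*size=*/16};   // bytes [8, 24)
  //   HeapSimulator::Chunk c{/*offset=*/16, /*size=*/8};   // bytes [16, 24)
  //   a.chunk_end();      // 16
  //   a.OverlapsWith(b);  // true: [0, 16) and [8, 24) intersect.
  //   a.OverlapsWith(c);  // false: the two ranges only touch at offset 16.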

  template <typename BufferType>
  struct HeapResult {
    // Returns the updated heap size if `chunk` is added to the heap.
    int64_t UpdatedHeapSize(const Chunk& chunk) const {
      return std::max(heap_size, chunk.chunk_end());
    }

    // The assignment of buffers to chunks.
    absl::flat_hash_map<const BufferType*, Chunk> chunk_map;

    // The total size in bytes of the heap, containing all assigned chunks.
    int64_t heap_size = 0;
  };
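
  // For example (illustrative values): a chunk that ends past the current
  // heap_size grows the heap; otherwise the size is unchanged.
  //
  //   HeapResult<HloValue> r;
  //   r.heap_size = 32;
  //   r.UpdatedHeapSize(Chunk{/*offset=*/24, /*size=*/16});  // == 40
  //   r.UpdatedHeapSize(Chunk{/*offset=*/0, /*size=*/16});   // == 32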

  // Result represents the result of the heap simulation.
  template <typename BufferType>
  struct Result {
    // Heap results.
    std::vector<HeapResult<BufferType>> heap_results;

    // The total size in bytes of the heaps.
    // heap_size == sum([hr.heap_size for hr in heap_results]).
    int64_t heap_size = 0;

    // The total size in bytes of heap fragmentation.
    int64_t fragmentation_size = 0;

    // A trace of heap simulation events.
    HeapSimulatorTrace debug_trace;
  };

  // The different options to be passed to the Run() APIs.
  struct Options {
    Options()
        : may_reuse_operand_buffers(true),
          alloc_constants(false),
          buffers_to_assign(nullptr) {}

    // Whether a buffer about to be Free()-ed can be recycled for a newly
    // born one, hence collapsing Free()+Alloc() calls (default true).
    bool may_reuse_operand_buffers;
    // Whether to issue Alloc() and Free() calls for constants (default false).
    bool alloc_constants;
    // If 'buffers_to_assign' is provided, only those buffers are assigned
    // offsets; otherwise all buffers defined by the instructions are assigned.
    const absl::flat_hash_set<const HloValue*>* buffers_to_assign;
  };
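
  // Example (an illustrative sketch): restrict the simulation to a
  // precomputed set of buffers and disable operand reuse. Here
  // `interesting_buffers` is a hypothetical set owned by the caller, which
  // must outlive the Run() call.
  //
  //   HeapSimulator::Options options;
  //   options.may_reuse_operand_buffers = false;
  //   options.buffers_to_assign = &interesting_buffers;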

  // Returns the minimum memory required to compute an HLO module where all
  // computations have been scheduled (represented by the given
  // schedule), assuming no fragmentation.
  static StatusOr<int64_t> MinimumMemoryForModule(
      const HloSchedule& schedule,
      const LogicalBuffer::SizeFunction& size_function);
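
  // Example (an illustrative sketch; assumes a valid `schedule` exists in the
  // caller's context):
  //
  //   auto size_fn = [](const BufferValue& buffer) {
  //     return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
  //   };
  //   StatusOr<int64_t> min_bytes =
  //       HeapSimulator::MinimumMemoryForModule(schedule, size_fn);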

  // Returns the minimum memory required to compute the given computation,
  // assuming no fragmentation.
  static StatusOr<int64_t> MinimumMemoryForComputation(
      const HloComputation& computation, const HloInstructionSequence& sequence,
      const HloAliasAnalysis& alias_analysis,
      const LogicalBuffer::SizeFunction& size_function,
      const absl::flat_hash_map<const HloComputation*, int64_t>*
          memory_by_computation = nullptr);

  static StatusOr<int64_t> MinimumMemoryForComputation(
      const HloComputation& computation, const HloInstructionSequence& sequence,
      const HloAliasAnalysis& alias_analysis,
      const LogicalBuffer::SizeFunction& size_function,
      const HloSchedule* schedule);

  // Run the heap simulation with the given algorithm, assuming the given
  // schedule, which must contain a topologically-consistent total
  // ordering of all instructions within each computation. The result is
  // invalid if instructions are not run in exactly this sequence.
  //
  // Running heap simulation on the whole module tends to save memory,
  // compared to running on a per-computation basis, since we can re-use
  // buffer space for called sub-computations.
  static StatusOr<Result<HloValue>> Run(
      std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
      const HloModule& module, const HloSchedule& schedule,
      const HloAliasAnalysis& alias_analysis,
      const BufferValue::SizeFunction& size_fn,
      const Options& options = Options());
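
  // Example (an illustrative sketch; `module`, `schedule`, `alias_analysis`,
  // and `size_fn` are assumed to exist in the caller's context):
  //
  //   auto algorithm =
  //       std::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
  //           /*alignment=*/64);
  //   StatusOr<HeapSimulator::Result<HloValue>> result = HeapSimulator::Run(
  //       std::move(algorithm), module, schedule, alias_analysis, size_fn);
  //   if (result.ok()) {
  //     int64_t total_bytes = result->heap_size;
  //   }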

  // Same as above, but runs on a single computation. The
  // 'instruction_sequence' must contain a topologically-consistent total
  // ordering of all instructions in the computation. The result is invalid if
  // instructions are not run in exactly this sequence.
  static StatusOr<Result<HloValue>> Run(
      std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
      const HloComputation& computation,
      const HloInstructionSequence& instruction_sequence,
      const HloAliasAnalysis& alias_analysis,
      const BufferValue::SizeFunction& size_fn,
      const Options& options = Options(),
      const absl::flat_hash_map<const HloComputation*, int64_t>*
          memory_by_computation = nullptr);

  // Same as above, but runs with a schedule that covers all nested
  // computations.
  static StatusOr<Result<HloValue>> Run(
      std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
      const HloComputation& computation,
      const HloInstructionSequence& instruction_sequence,
      const HloAliasAnalysis& alias_analysis,
      const BufferValue::SizeFunction& size_fn, const HloSchedule* schedule,
      const Options& options = Options());

 private:
  // If 'schedule' is non-null, it is used to find kCall and kWhile
  // sub-computations, and the heap simulation for those sub-computations will
  // be run recursively. I.e. the simulation is run over the whole module.
  HeapSimulator(std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
                const BufferValue::SizeFunction& size_fn,
                const Options& options, const HloSchedule* schedule = nullptr,
                const absl::flat_hash_map<const HloComputation*, int64_t>*
                    memory_by_computation = nullptr);
  ~HeapSimulator();

  Status RunComputation(const HloComputation& computation,
                        const HloInstructionSequence& instruction_sequence,
                        const HloAliasAnalysis& alias_analysis,
                        HloLiveRange* live_range);

  bool IgnoreBuffer(const HloValue* buffer) const;
  void Alloc(const HloValue* buffer, const HloInstruction* instruction);
  void Free(const HloValue* buffer, const HloInstruction* instruction);
  // ShareBuffer indicates that a new buffer is defined and must share the
  // address of the buffer it is shared with.
  void ShareBuffer(const HloValue* buffer, const HloValue* shared,
                   const HloInstruction* instruction);

  // Returns true if:
  //  - the two buffers belong to the same shared group, or
  //  - either of the buffers has no shared group assigned.
  bool InSameSharedGroup(const HloValue* left, const HloValue* right);
  Result<HloValue> Finish();

  void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
                      const HloValue* buffer, const HloInstruction* instruction,
                      const HloValue* share_with_canonical);

  // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap,
  // in which case we are calculating the same allocs/frees twice in the
  // simulation.
  const std::unique_ptr<NoFragmentationStatsHeap<HloValue>>
      no_fragmentation_stats_;
  const std::unique_ptr<HeapAlgorithm<HloValue>> algorithm_;
  const BufferValue::SizeFunction size_fn_;
  const Options options_;
  // schedule_ is set by buffer assignment, and memory_by_computation_ is
  // set by hlo scheduling. Then, in RunComputation, we check both in order to
  // handle subcomputations. It would be good to unify the handling of
  // subcomputations, but it's not clear how.
  const HloSchedule* schedule_;
  const absl::flat_hash_map<const HloComputation*, int64_t>*
      memory_by_computation_;

  // Hold some sets for error-checking the sequence of Alloc and Free calls.
  absl::flat_hash_set<const HloValue*> allocated_buffers_;
  absl::flat_hash_set<const HloValue*> freed_buffers_;

  // Debugging information filled in while the heap simulator runs.
  HeapSimulatorTrace debug_trace_;
};

// Abstract base class describing a heap simulation algorithm that assigns
// offsets to buffers.  A sequence of Alloc / Free calls will be made, with the
// same semantics as a regular memory heap.  Finish will be called at the end
// to collect the simulation results.
template <typename BufferType>
class HeapAlgorithm {
 public:
  using Chunk = HeapSimulator::Chunk;
  using Result = HeapSimulator::Result<BufferType>;
  using HeapResult = HeapSimulator::HeapResult<BufferType>;

  virtual ~HeapAlgorithm() = default;

  // Alloc allocates a buffer of 'size' bytes.
  virtual void Alloc(const BufferType* buffer, int64_t size) = 0;

  // Takes memory usage of subcomputations into account when calculating the
  // memory usage of a computation. Currently, we don't handle buffer aliasing
  // between computations entirely correctly. We are careful to not double
  // count for the output buffers of whiles/conds/calls. But we don't take into
  // account other aliases, such as for the while init. A more thorough
  // solution would require something like
  // BufferAssignment::BuildColocatedBufferSets.
  // TODO(b/65835246):
  // Since TuplePointsToAnalysis is being replaced with a module-aware alias
  // analysis, it's not worth making major changes to HeapSimulator now.
  virtual void AccountForSubcomputationMemory(
      const HloInstruction* instruction,
      // The total number of bytes allocated by instruction.
      int64_t alloc_size_by_instruction,
      const absl::flat_hash_map<const HloComputation*, int64_t>&
          memory_by_computation) {}

  // Free de-allocates a previously allocated buffer.
  virtual void Free(const BufferType* buffer, int64_t size) = 0;

  // Indicates that a buffer has to be collocated with another buffer. In
  // addition to Alloc and Free, the heap simulator exposes a concept of buffer
  // sharing.  When ShareBuffer is called, instead of allocating new space for
  // the buffer, it associates the buffer with a previously allocated (or
  // shared) buffer.  Each group of mutually-shared buffers points to a single
  // SharedGroup instance, which is a shared control block.
  virtual void ShareWith(const BufferType* buffer, const BufferType* share_with,
                         int64_t size) {
    Alloc(buffer, size);
  }

  // Finish collects the buffer offset assignment results.  Finish may only be
  // called once, after all Alloc and Free calls.
  virtual Result Finish() = 0;
};
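
// The call protocol, sketched with hypothetical buffers `a` and `b` and an
// algorithm instance `algo`: the simulator issues one Alloc and one Free (or
// ShareWith) per buffer in simulated time order, then collects the result.
//
//   algo.Alloc(a, /*size=*/64);    // a is born.
//   algo.Alloc(b, /*size=*/128);   // b is born while a is still live.
//   algo.Free(a, /*size=*/64);     // a dies.
//   algo.Free(b, /*size=*/128);    // b dies.
//   auto result = algo.Finish();   // Offsets for a and b are now final.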

// NoFragmentationStatsHeap computes the heap size assuming no fragmentation;
// this is the absolute minimum size for a given instruction sequence.  The
// result.chunk_map returned in Finish is always empty, since we only collect
// stats, and don't actually compute chunk assignments.
template <typename BufferType>
class NoFragmentationStatsHeap : public HeapAlgorithm<BufferType> {
 public:
  using Result = HeapSimulator::Result<BufferType>;

  NoFragmentationStatsHeap() = default;
  ~NoFragmentationStatsHeap() override = default;

  void Alloc(const BufferType* buffer, int64_t size) override;

  void AccountForSubcomputationMemory(
      const HloInstruction* instruction, int64_t alloc_size_by_instruction,
      const absl::flat_hash_map<const HloComputation*, int64_t>&
          memory_by_computation) override;

  void Free(const BufferType* buffer, int64_t size) override;

  Result Finish() override;

 private:
  int64_t current_heap_size_ = 0;
  int64_t max_heap_size_ = 0;
};
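
// For instance (an illustrative trace), the call sequence
//
//   Alloc(a, 16); Alloc(b, 32); Free(a, 16); Alloc(c, 24);
//
// drives current_heap_size_ through 16, 48, 32, and 56, so Finish() reports a
// heap_size of 56: the peak of the running sum, i.e. the fragmentation-free
// minimum.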

// Node in BufferIntervalTree that stores the alloc and free times of a buffer,
// and the chunk assigned to it.
struct BufferIntervalTreeNode {
  // Alloc time.
  int64_t start;
  // Free time.
  int64_t end;
  // Maximum free time of all nodes in the subtree where this node is the root.
  int64_t subtree_end;
  // Allocated chunk for the buffer.
  HeapSimulator::Chunk chunk;
  // Left child.
  BufferIntervalTreeNode* left;
  // Right child.
  BufferIntervalTreeNode* right;
  // Parent node.
  BufferIntervalTreeNode* parent;
};

// An interval tree that can query buffers overlapping in time.
class BufferIntervalTree {
 public:
  using Chunk = HeapSimulator::Chunk;
  // Adds a buffer to the interval tree, with the time interval and allocated
  // chunk specified.
  void Add(int64_t start, int64_t end, const Chunk& chunk);

  // Removes the interval from the tree. Returns true if the chunk is removed.
  bool Remove(int64_t start, int64_t end, const Chunk& chunk);

  // Returns a vector of allocated chunks that overlap with the given time
  // interval.
  std::vector<Chunk> ChunksOverlappingInTime(int64_t start, int64_t end) const;

  BufferIntervalTreeNode* GetRoot() { return root_; }

 private:
  BufferIntervalTreeNode* root_ = nullptr;
  std::list<BufferIntervalTreeNode> node_storage_;
};
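
// Example (an illustrative sketch) of querying chunks live in a time window:
//
//   BufferIntervalTree tree;
//   tree.Add(/*start=*/0, /*end=*/10, HeapSimulator::Chunk{0, 16});
//   tree.Add(/*start=*/5, /*end=*/25, HeapSimulator::Chunk{16, 16});
//   tree.Add(/*start=*/30, /*end=*/40, HeapSimulator::Chunk{0, 16});
//   // Expected to return the first two chunks; the third interval does not
//   // start until t=30.
//   std::vector<HeapSimulator::Chunk> live =
//       tree.ChunksOverlappingInTime(/*start=*/8, /*end=*/20);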

// GlobalDecreasingSizeBestFitHeap collects the live intervals of all buffers,
// then allocates them in decreasing spatial or temporal size regardless of the
// alloc/free time. It internally tracks the allocated buffers and their live
// intervals; when allocating a buffer, it finds the best-fit free chunk during
// its live interval.
template <typename BufferType>
class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
 public:
  using HeapResult = HeapSimulator::HeapResult<BufferType>;
  using Result = HeapSimulator::Result<BufferType>;
  using Chunk = HeapSimulator::Chunk;

  enum Type {
    kSpatial = 0,
    kTemporal,
  };

  // BufferInterval stores a buffer's size and time interval.
  struct BufferInterval {
    const BufferType* buffer;
    int64_t size;
    // Alloc time of the buffer.
    int64_t start;
    // Free time of the buffer.
    int64_t end;

    // Buffers that need to be colocated with this one.
    absl::InlinedVector<const BufferType*, 2> colocations;

    // True if this buffer needs an allocation. False if it is colocated with
    // another buffer.
    bool need_allocation;
  };

  // Comparison function that is used to store buffer intervals.
  using BufferIntervalCompare =
      std::function<bool(const BufferInterval&, const BufferInterval&)>;

  explicit GlobalDecreasingSizeBestFitHeap(int64_t alignment,
                                           Type type = kSpatial);
  ~GlobalDecreasingSizeBestFitHeap() override {}

  void Alloc(const BufferType* buffer, int64_t size) override;
  void Free(const BufferType* buffer, int64_t size) override;

  void ShareWith(const BufferType* buffer, const BufferType* share_with,
                 int64_t size) override;

  Result Finish() override;

  // Returns a BufferIntervalCompare function that sorts by spatial size. We
  // don't look at colocated buffers, as they should have the same size.
  static BufferIntervalCompare GetSpatialBufferIntervalCompare();
411 
412  protected:
413   // Returns the buffer intervals sorted according to buffer_interval_compare_.
414   std::vector<BufferInterval> GetSortedBufferIntervals() const;
415 
416   // These two methods below are exposed to other heap algorithms that inherit
417   // from this class. The Finish() method tries to find a candidate chunk for
418   // each BufferInterval, after calling GetSortedBufferIntervals. If a
419   // non-negative preferred_offset is provided, FindChunkCandidate attempts
420   // finding a chunk at this offset. The Finish() method can then call
421   // CommitChunk to associate the chunk with the BufferInterval, if the final
422   // heap size is within the limits.
423   Chunk FindChunkCandidate(const BufferInterval& buffer_interval,
424                            int64_t preferred_offset = -1) const;
425   void CommitChunk(const BufferInterval& buffer_interval, Chunk chunk);
426 
427   // Adds the buffer and the chunk to the result chunk map.
428   virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk);
429 
430   // Return a BufferIntervalCompare function that sorts by live ranges.  A live
431   // range is defined by the range between the start of the first buffer and the
432   // end of the last co-located buffer.  There could be "holes" in the live
433   // ranges of each co-located buffers, but in this heuristics we think they are
434   // contiguous.
435   BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
436 
437   absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
438   HeapResult result_;
439   BufferIntervalCompare buffer_interval_compare_;
440   BufferIntervalTree interval_tree_;
441 
442  private:
443   int64_t alignment_;
444 
445   // The current time represented as an integer. It increments by 1 at each
446   // Alloc or Free call.
447   int64_t current_time_ = 0;
448 
449   // Returns all transitive colocated buffers of this buffer interval. I.e., If
450   // a buffer A is colocated with B and B is colocated with C, this function
451   // returns all three of them.
452   absl::flat_hash_set<const BufferType*> GetTransitiveColocations(
453       const BufferInterval& interval) const;
454 };
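
// In sketch form (hypothetical code), a Finish() override in a subclass
// typically follows the protocol described in the protected section above:
//
//   for (const BufferInterval& interval : GetSortedBufferIntervals()) {
//     if (!interval.need_allocation) continue;
//     Chunk chunk = FindChunkCandidate(interval);
//     CommitChunk(interval, chunk);  // Also places colocated buffers.
//   }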

// This class implements an algorithm that will produce multiple heaps, where
// each heap size is constrained by a given limit. Note that the constraint is
// soft, meaning that a valid heap result is generated even if there are some
// buffer sizes larger than the given constraint size.
//
// Pseudocode:
//   while( `buffers` is not empty ) {
//     create a new heap `h`
//     for (each buffer `buf` in `buffers` in the size-decreasing order) {
//       if (buf.size() is larger than the heap size limit &&
//           `h` is empty) {
//         h.place(buf)
//         buffers.remove(buf)
//       } else if (placing `buf` into `h` does not violate size
//           constraint) {
//         h.place(buf)
//         buffers.remove(buf)
//       }
//     }
//   }
class ConstrainedGlobalDecreasingSizeBestFitHeap
    : public GlobalDecreasingSizeBestFitHeap<HloValue> {
 public:
  explicit ConstrainedGlobalDecreasingSizeBestFitHeap(
      uint64_t size_limit_per_heap, int64_t alignment, Type type = kSpatial)
      : GlobalDecreasingSizeBestFitHeap<HloValue>(alignment, type),
        size_limit_per_heap_(size_limit_per_heap) {}
  ~ConstrainedGlobalDecreasingSizeBestFitHeap() override {}

  Result Finish() override;

 private:
  uint64_t size_limit_per_heap_;
};
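
// Example (an illustrative sketch): cap each generated heap at 1 GiB with
// 64-byte alignment; Finish() then returns one HeapResult per heap in
// Result::heap_results.
//
//   ConstrainedGlobalDecreasingSizeBestFitHeap heap(
//       /*size_limit_per_heap=*/uint64_t{1} << 30, /*alignment=*/64);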

// A heap algorithm that chooses the best results from other algorithms added
// to it.
template <typename BufferType>
class ChooseBestHeapAlgorithm : public HeapAlgorithm<BufferType> {
 public:
  using Result = HeapSimulator::Result<BufferType>;

  ChooseBestHeapAlgorithm(
      std::unique_ptr<std::vector<std::unique_ptr<HeapAlgorithm<BufferType>>>>
          algorithms)
      : algorithms_(std::move(*algorithms)) {}
  ~ChooseBestHeapAlgorithm() override {}

  void Alloc(const BufferType* buffer, int64_t size) override {
    for (auto& algorithm : algorithms_) {
      algorithm->Alloc(buffer, size);
    }
  }

  void ShareWith(const BufferType* buffer, const BufferType* share_with,
                 int64_t size) override {
    for (auto& algorithm : algorithms_) {
      algorithm->ShareWith(buffer, share_with, size);
    }
  }

  void Free(const BufferType* buffer, int64_t size) override {
    for (auto& algorithm : algorithms_) {
      algorithm->Free(buffer, size);
    }
  }

  Result Finish() override;

 private:
  std::vector<std::unique_ptr<HeapAlgorithm<BufferType>>> algorithms_;
};
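
// Example (an illustrative sketch): race a spatially-sorted and a
// temporally-sorted best-fit heap against each other; Finish() keeps the
// better result, presumably the one with the smaller heap_size.
//
//   auto algorithms = std::make_unique<
//       std::vector<std::unique_ptr<HeapAlgorithm<HloValue>>>>();
//   algorithms->push_back(
//       std::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
//           /*alignment=*/64,
//           GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
//   algorithms->push_back(
//       std::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
//           /*alignment=*/64,
//           GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
//   ChooseBestHeapAlgorithm<HloValue> chooser(std::move(algorithms));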

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_