• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
17 #define TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
18 
19 #include <memory>
20 #include <set>
21 #include <utility>
22 #include <vector>
23 
24 #include "absl/container/flat_hash_map.h"
25 #include "absl/container/flat_hash_set.h"
26 #include "tensorflow/compiler/xla/service/buffer_value.h"
27 #include "tensorflow/compiler/xla/service/buffer_value_containers.h"
28 #include "tensorflow/compiler/xla/service/hlo.pb.h"
29 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
30 #include "tensorflow/compiler/xla/service/hlo_buffer.h"
31 #include "tensorflow/compiler/xla/service/hlo_computation.h"
32 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
33 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
34 #include "tensorflow/compiler/xla/service/hlo_live_range.h"
35 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
36 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
37 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
38 #include "tensorflow/compiler/xla/statusor.h"
39 
40 namespace xla {
41 
42 // Forward declare classes defined below.
43 class HeapAlgorithm;
44 class NoFragmentationStatsHeap;
45 
46 // HeapSimulator assigns buffer offsets by running a simulation of a regular
47 // memory heap with Alloc and Free calls.  It only works for completely
48 // sequential instruction sequences.  Unlike regular heaps, we have the
49 // advantage that the sequence of Alloc and Free calls is known up-front; we
50 // don't need to return the assignment of buffer offsets until the very end.
51 class HeapSimulator {
52  public:
53   // Chunk represents a contiguous piece of memory.  Each BufferValue will be
54   // associated with a chunk in the assignment result.
55   struct Chunk {
56     int64 offset;
57     int64 size;
58 
chunk_endChunk59     int64 chunk_end() const { return offset + size; }
60 
61     bool OverlapsWith(Chunk other_chunk) const;
62   };
63 
64   // Result represents the result of the heap simulation.
65   struct Result {
66     // The assignment of buffers to chunks.
67     absl::flat_hash_map<const HloValue*, Chunk> chunk_map;
68 
69     // The total size in bytes of the heap, containing all assigned chunks.
70     int64 heap_size = 0;
71 
72     // The total size in bytes of heap fragmentation.
73     int64 fragmentation_size = 0;
74 
75     // A trace of heap simulation events.
76     HeapSimulatorTrace debug_trace;
77   };
78 
79   // The different options to be passed to the Run() APIs.
80   struct Options {
OptionsOptions81     Options()
82         : may_reuse_operand_buffers(true),
83           alloc_constants(false),
84           buffers_to_assign(nullptr) {}
85 
86     // Whether a buffer about to be Free()-ed, can be recycled for a new born
87     // one, hence collapsing Free()+Alloc() calls (default true).
88     bool may_reuse_operand_buffers;
89     // Whether to issue Alloc() and Free() calls for constants (default false).
90     bool alloc_constants;
91     // If 'buffers_to_assign' is provided, only those buffers are assigned
92     // offsets, otherwise all buffers defined by the instructions are assigned.
93     const absl::flat_hash_set<const HloValue*>* buffers_to_assign;
94   };
95 
96   // Returns the minimum memory required to compute an HLO module where all
97   // computations have been scheduled (represented by the given
98   // schedule), assuming no fragmentation.
99   static StatusOr<int64> MinimumMemoryForModule(
100       const HloSchedule& schedule,
101       const LogicalBuffer::SizeFunction& size_function);
102 
103   // Returns the minimum memory required to compute the given computation,
104   // assuming no fragmentation.
105   static StatusOr<int64> MinimumMemoryForComputation(
106       const HloComputation& computation, const HloInstructionSequence& sequence,
107       const HloAliasAnalysis& alias_analysis,
108       const LogicalBuffer::SizeFunction& size_function,
109       const absl::flat_hash_map<const HloComputation*, int64>*
110           memory_by_computation = nullptr);
111 
112   static StatusOr<int64> MinimumMemoryForComputation(
113       const HloComputation& computation, const HloInstructionSequence& sequence,
114       const HloAliasAnalysis& alias_analysis,
115       const LogicalBuffer::SizeFunction& size_function,
116       const HloSchedule* schedule);
117 
118   // Run the heap simulation with the given algorithm, assuming the given
119   // schedule, which must contain a topologically-consistent total
120   // ordering of all instructions within each computation. The result is invalid
121   // if instructions are not run in exactly this sequence.
122   //
123   // Running heap simulation on the whole module tends to save memory, compared
124   // to running on a per-computation basis, since we can re-use buffer space for
125   // called sub-computations.
126   //
127   static StatusOr<Result> Run(std::unique_ptr<HeapAlgorithm> algorithm,
128                               const HloModule& module,
129                               const HloSchedule& schedule,
130                               const HloAliasAnalysis& alias_analysis,
131                               const BufferValue::SizeFunction& size_fn,
132                               const Options& options = Options());
133 
134   // Same as above, but runs on a single computation. The 'instruction_sequence'
135   // must contain a topologically-consistent total ordering of all instructions
136   // in the computation. The result is invalid if instructions are not run in
137   // exactly this sequence.
138   static StatusOr<Result> Run(
139       std::unique_ptr<HeapAlgorithm> algorithm,
140       const HloComputation& computation,
141       const HloInstructionSequence& instruction_sequence,
142       const HloAliasAnalysis& alias_analysis,
143       const BufferValue::SizeFunction& size_fn,
144       const Options& options = Options(),
145       const absl::flat_hash_map<const HloComputation*, int64>*
146           memory_by_computation = nullptr);
147 
148   // Same as above, but runs on with a schedule that covers all nested
149   // computations.
150   static StatusOr<Result> Run(
151       std::unique_ptr<HeapAlgorithm> algorithm,
152       const HloComputation& computation,
153       const HloInstructionSequence& instruction_sequence,
154       const HloAliasAnalysis& alias_analysis,
155       const BufferValue::SizeFunction& size_fn, const HloSchedule* schedule,
156       const Options& options = Options());
157 
158  private:
159   // If 'schedule' is non-null, it is used to find kCall and kWhile
160   // sub-computations, and the heap simulation for those sub-computations will
161   // be run recursively. I.e. the simulation is run over the whole module.
162   HeapSimulator(std::unique_ptr<HeapAlgorithm> algorithm,
163                 const BufferValue::SizeFunction& size_fn,
164                 const Options& options, const HloSchedule* schedule = nullptr,
165                 const absl::flat_hash_map<const HloComputation*, int64>*
166                     memory_by_computation = nullptr);
167   ~HeapSimulator();
168 
169   Status RunComputation(const HloComputation& computation,
170                         const HloInstructionSequence& instruction_sequence,
171                         const HloAliasAnalysis& alias_analysis,
172                         HloLiveRange* live_range);
173 
174   bool IgnoreBuffer(const HloValue* buffer) const;
175   void Alloc(const HloValue* buffer, const HloInstruction* instruction);
176   void Free(const HloValue* buffer, const HloInstruction* instruction);
177   // ShareBuffer indicates that a new buffer is defined and it has to be the
178   // same address as the shared one.
179   void ShareBuffer(const HloValue* buffer, const HloValue* shared,
180                    const HloInstruction* instruction);
181 
182   // Returns true if:
183   //  Two buffers belong to the same shared group.
184   //  Eight of the buffer has no shared group assigned.
185   bool InSameSharedGroup(const HloValue* left, const HloValue* right);
186   Result Finish();
187 
188   void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
189                       const HloValue* buffer, const HloInstruction* instruction,
190                       const HloValue* share_with_canonical);
191 
192   // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap,
193   // in which case we are calculating the same allocs/frees twice in the
194   // simulation.
195   const std::unique_ptr<NoFragmentationStatsHeap> no_fragmentation_stats_;
196   const std::unique_ptr<HeapAlgorithm> algorithm_;
197   const BufferValue::SizeFunction size_fn_;
198   const Options options_;
199   // schedule_ is set by buffer assignment, and memory_by_computation_ is
200   // set by hlo scheduling. Then, in RunComputation, we check both in order to
201   // handle subcomputations. It would be good to unify the handling of
202   // subcomputations, but it's not clear how.
203   const HloSchedule* schedule_;
204   const absl::flat_hash_map<const HloComputation*, int64>*
205       memory_by_computation_;
206 
207   // Hold some sets for error-checking the sequence of Alloc and Free calls.
208   absl::flat_hash_set<const HloValue*> allocated_buffers_;
209   absl::flat_hash_set<const HloValue*> freed_buffers_;
210 
211   // Debugging information filled in while the heap simulator runs.
212   HeapSimulatorTrace debug_trace_;
213 };
214 
215 // Abstract base class describing a heap simulation algorithm that assigns
216 // offsets to buffers.  A sequence of Alloc / Free calls will be made, with the
217 // same semantics as a regular memory heap.  Finish will be called at the end to
218 // collect the simulation results.
219 class HeapAlgorithm {
220  public:
221   using Chunk = HeapSimulator::Chunk;
222   using Result = HeapSimulator::Result;
223 
224   virtual ~HeapAlgorithm() = default;
225 
226   // Alloc allocates a buffer of 'size' bytes.
227   virtual void Alloc(const HloValue* buffer, int64 size) = 0;
228 
229   // Takes memory usage of subcomputations into account when calculating the
230   // memory usage of a computation. Currently, we don't handle buffer aliasing
231   // between computations entirely correctly. We are careful to not double count
232   // for the output buffers of whiles/conds/calls. But we don't take into
233   // account other aliases, such as for the while init. A more thorough solution
234   // would require something like BufferAssignment::BuildColocatedBufferSets.
235   // TODO(b/65835246):
236   // Since TuplePointsToAnalysis is being replaced with a module-aware alias
237   // analysis, it's not worth making major changes to HeapSimulator now.
AccountForSubcomputationMemory(const HloInstruction * instruction,int64 alloc_size_by_instruction,const absl::flat_hash_map<const HloComputation *,int64> & memory_by_computation)238   virtual void AccountForSubcomputationMemory(
239       const HloInstruction* instruction,
240       // The total number of bytes allocated by instruction.
241       int64 alloc_size_by_instruction,
242       const absl::flat_hash_map<const HloComputation*, int64>&
243           memory_by_computation) {}
244 
245   // Free de-allocates a previously allocated buffer.
246   virtual void Free(const HloValue* buffer, int64 size) = 0;
247 
248   // Indicates that a buffer has to be collocated with another buffer. In
249   // addition to Alloc and Free, the heap simulator exposes a concept of buffer
250   // sharing.  When ShareBuffer is called, instead of allocating new space for
251   // the buffer, it associates the buffer with a previously allocated (or
252   // shared) buffer.  Each group of mutually-shared buffers points to a single
253   // SharedGroup instance, which is a shared control block.
ShareWith(const HloValue * buffer,const HloValue * share_with,int64 size)254   virtual void ShareWith(const HloValue* buffer, const HloValue* share_with,
255                          int64 size) {
256     Alloc(buffer, size);
257   }
258 
259   // Finish collects the buffer offset assignment results.  Finish may only be
260   // called once, after all Alloc and Free calls.
261   virtual Result Finish() = 0;
262 };
263 
264 // NoFragmentationStatsHeap computes the heap size assuming no fragmentation;
265 // this is the absolute minimum size for a given instruction sequence.  The
266 // result.chunk_map returned in Finish is always empty, since we only collect
267 // stats, and don't actually compute chunk assignments.
268 class NoFragmentationStatsHeap : public HeapAlgorithm {
269  public:
270   NoFragmentationStatsHeap() = default;
271   ~NoFragmentationStatsHeap() override = default;
272 
273   void Alloc(const HloValue* buffer, int64 size) override;
274 
275   void AccountForSubcomputationMemory(
276       const HloInstruction* instruction, int64 alloc_size_by_instruction,
277       const absl::flat_hash_map<const HloComputation*, int64>&
278           memory_by_computation) override;
279 
280   void Free(const HloValue* buffer, int64 size) override;
281 
282   Result Finish() override;
283 
284  private:
285   int64 current_heap_size_ = 0;
286   int64 max_heap_size_ = 0;
287 };
288 
289 // Node in BufferIntervalTree that stores the alloc and free times of a buffer,
290 // and the chunk assigned to it.
291 struct BufferIntervalTreeNode {
292   // Alloc time.
293   int64 start;
294   // Free time.
295   int64 end;
296   // Maximum free time of all nodes in the subtree where this node is the root.
297   int64 subtree_end;
298   // Allocated chunk for the buffer.
299   HeapSimulator::Chunk chunk;
300   // Left child.
301   BufferIntervalTreeNode* left;
302   // Right child.
303   BufferIntervalTreeNode* right;
304 };
305 
306 // An interval tree that can query buffers overlapping in time.
307 class BufferIntervalTree {
308  public:
309   using Chunk = HeapSimulator::Chunk;
310   // Adds a buffer to the interval tree, with the time interval and allocated
311   // chunk specified.
312   void Add(int64 start, int64 end, const Chunk& chunk);
313 
314   // Returns vector of allocated chunks that overlap with the given time
315   // interval.
316   std::vector<Chunk> ChunksOverlappingInTime(int64 start, int64 end) const;
317 
318  private:
319   std::list<BufferIntervalTreeNode> node_storage_;
320 };
321 
322 // GlobalDecreasingSizeBestFitHeap collects the live intervals of all buffers,
323 // then allocates them in decreasing spatial or temporal size regardless of the
324 // alloc/free time. It internally tracks the allocated buffers and their live
325 // intervals; when allocating a buffer, it finds the best-fit free chunk during
326 // its live interval.
327 class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm {
328  public:
329   enum Type {
330     kSpatial = 0,
331     kTemporal,
332   };
333 
334   // BufferInterval stores a buffer's size and time interval.
335   struct BufferInterval {
336     const HloValue* buffer;
337     int64 size;
338     // Alloc time of the buffer.
339     int64 start;
340     // Free time of the buffer.
341     int64 end;
342 
343     // Colocation buffers that need to be collocated with this one.
344     std::vector<const HloValue*> colocations;
345 
346     // True if this buffer needs an allocation. False if it is collocated with
347     // other buffer.
348     bool need_allocation;
349   };
350 
351   // Comparison function that is used to store buffer intervals.
352   using BufferIntervalCompare =
353       std::function<bool(const BufferInterval&, const BufferInterval&)>;
354 
355   explicit GlobalDecreasingSizeBestFitHeap(int64 alignment,
356                                            Type type = kSpatial);
~GlobalDecreasingSizeBestFitHeap()357   ~GlobalDecreasingSizeBestFitHeap() override {}
358 
359   void Alloc(const HloValue* buffer, int64 size) override;
360   void Free(const HloValue* buffer, int64 size) override;
361 
362   void ShareWith(const HloValue* buffer, const HloValue* share_with,
363                  int64 size) override;
364 
365   Result Finish() override;
366 
367   // Return a BufferIntervalCompare function that sort by spatial size. We don't
368   // look at co-locates as they should have the same size.
369   static BufferIntervalCompare GetSpatialBufferIntervalCompare();
370 
371  protected:
372   // The candidate contains a chunk and the resultant heap size if this
373   // chunk is to be committed.
374   struct ChunkCandidate {
375     Chunk chunk;
376     int64 heap_size;
377   };
378 
379   // Returns the buffer intervals sorted according to buffer_interval_compare_.
380   std::vector<BufferInterval> GetSortedBufferIntervals() const;
381 
382   // These two methods below are exposed to other heap algorithms that inherit
383   // from this class. The Finish() method tries to find a candidate chunk for
384   // each BufferInterval, after calling GetSortedBufferIntervals. If a
385   // non-negative preferred_offset is provided, FindChunkCandidate attempts
386   // finding a chunk at this offset. The ChunkCandidate returns the chunk and
387   // the final heap size if it chunk is to be committed. The Finish() method can
388   // then call CommitChunk to associate the chunk with the BufferInterval, if
389   // the final heap size is within the limits.
390   ChunkCandidate FindChunkCandidate(const BufferInterval& buffer_interval,
391                                     int64 preferred_offset = -1) const;
392   void CommitChunk(const BufferInterval& buffer_interval,
393                    ChunkCandidate chunk_candidate);
394   // Adds the buffer and the chunk to the result chunk map.
395   virtual void AddToChunkMap(const HloValue* buffer, Chunk chunk);
396 
397   // Return a BufferIntervalCompare function that sorts by live ranges.  A live
398   // range is defined by the range between the start of the first buffer and the
399   // end of the last co-located buffer.  There could be "holes" in the live
400   // ranges of each co-located buffers, but in this heuristics we think they are
401   // contiguous.
402   BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
403 
404   absl::flat_hash_map<const HloValue*, BufferInterval> buffer_intervals_;
405   Result result_;
406   BufferIntervalCompare buffer_interval_compare_;
407 
408  private:
409   int64 alignment_;
410 
411   // The current time represented as an integer. It increments by 1 at each
412   // Alloc or Free call.
413   int64 current_time_ = 0;
414 
415   BufferIntervalTree interval_tree_;
416 
417   // Returns all transitive colocated buffers of this buffer interval. I.e., If
418   // a buffer A is colocated with B and B is colocated with C, this function
419   // returns all three of them.
420   absl::flat_hash_set<const HloValue*> GetTransitiveColocations(
421       const BufferInterval& interval) const;
422 };
423 
424 // A heap algorithm that chooses the best results from other algorithms added to
425 // it.
426 class ChooseBestHeapAlgorithm : public HeapAlgorithm {
427  public:
ChooseBestHeapAlgorithm(std::unique_ptr<std::vector<std::unique_ptr<HeapAlgorithm>>> algorithms)428   ChooseBestHeapAlgorithm(
429       std::unique_ptr<std::vector<std::unique_ptr<HeapAlgorithm>>> algorithms)
430       : algorithms_(std::move(*algorithms)) {}
~ChooseBestHeapAlgorithm()431   ~ChooseBestHeapAlgorithm() override {}
432 
Alloc(const HloValue * buffer,int64 size)433   void Alloc(const HloValue* buffer, int64 size) override {
434     for (auto& algorithm : algorithms_) {
435       algorithm->Alloc(buffer, size);
436     }
437   }
438 
ShareWith(const HloValue * buffer,const HloValue * share_with,int64 size)439   void ShareWith(const HloValue* buffer, const HloValue* share_with,
440                  int64 size) override {
441     for (auto& algorithm : algorithms_) {
442       algorithm->ShareWith(buffer, share_with, size);
443     }
444   }
445 
Free(const HloValue * buffer,int64 size)446   void Free(const HloValue* buffer, int64 size) override {
447     for (auto& algorithm : algorithms_) {
448       algorithm->Free(buffer, size);
449     }
450   }
451 
452   Result Finish() override;
453 
454  private:
455   std::vector<std::unique_ptr<HeapAlgorithm>> algorithms_;
456 };
457 
458 }  // namespace xla
459 
460 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
461