1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_ 17 #define TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_ 18 19 #include <memory> 20 #include <set> 21 #include <utility> 22 #include <vector> 23 24 #include "absl/container/flat_hash_map.h" 25 #include "absl/container/flat_hash_set.h" 26 #include "tensorflow/compiler/xla/service/buffer_value.h" 27 #include "tensorflow/compiler/xla/service/buffer_value_containers.h" 28 #include "tensorflow/compiler/xla/service/hlo.pb.h" 29 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h" 30 #include "tensorflow/compiler/xla/service/hlo_buffer.h" 31 #include "tensorflow/compiler/xla/service/hlo_computation.h" 32 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" 33 #include "tensorflow/compiler/xla/service/hlo_instruction.h" 34 #include "tensorflow/compiler/xla/service/hlo_live_range.h" 35 #include "tensorflow/compiler/xla/service/hlo_ordering.h" 36 #include "tensorflow/compiler/xla/service/hlo_schedule.h" 37 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" 38 #include "tensorflow/compiler/xla/statusor.h" 39 40 namespace xla { 41 42 // Forward declare classes defined below. 43 class HeapAlgorithm; 44 class NoFragmentationStatsHeap; 45 46 // HeapSimulator assigns buffer offsets by running a simulation of a regular 47 // memory heap with Alloc and Free calls. It only works for completely 48 // sequential instruction sequences. Unlike regular heaps, we have the 49 // advantage that the sequence of Alloc and Free calls is known up-front; we 50 // don't need to return the assignment of buffer offsets until the very end. 51 class HeapSimulator { 52 public: 53 // Chunk represents a contiguous piece of memory. Each BufferValue will be 54 // associated with a chunk in the assignment result. 55 struct Chunk { 56 int64 offset; 57 int64 size; 58 chunk_endChunk59 int64 chunk_end() const { return offset + size; } 60 61 bool OverlapsWith(Chunk other_chunk) const; 62 }; 63 64 // Result represents the result of the heap simulation. 65 struct Result { 66 // The assignment of buffers to chunks. 67 absl::flat_hash_map<const HloValue*, Chunk> chunk_map; 68 69 // The total size in bytes of the heap, containing all assigned chunks. 70 int64 heap_size = 0; 71 72 // The total size in bytes of heap fragmentation. 73 int64 fragmentation_size = 0; 74 75 // A trace of heap simulation events. 76 HeapSimulatorTrace debug_trace; 77 }; 78 79 // The different options to be passed to the Run() APIs. 80 struct Options { OptionsOptions81 Options() 82 : may_reuse_operand_buffers(true), 83 alloc_constants(false), 84 buffers_to_assign(nullptr) {} 85 86 // Whether a buffer about to be Free()-ed, can be recycled for a new born 87 // one, hence collapsing Free()+Alloc() calls (default true). 88 bool may_reuse_operand_buffers; 89 // Whether to issue Alloc() and Free() calls for constants (default false). 90 bool alloc_constants; 91 // If 'buffers_to_assign' is provided, only those buffers are assigned 92 // offsets, otherwise all buffers defined by the instructions are assigned. 93 const absl::flat_hash_set<const HloValue*>* buffers_to_assign; 94 }; 95 96 // Returns the minimum memory required to compute an HLO module where all 97 // computations have been scheduled (represented by the given 98 // schedule), assuming no fragmentation. 99 static StatusOr<int64> MinimumMemoryForModule( 100 const HloSchedule& schedule, 101 const LogicalBuffer::SizeFunction& size_function); 102 103 // Returns the minimum memory required to compute the given computation, 104 // assuming no fragmentation. 105 static StatusOr<int64> MinimumMemoryForComputation( 106 const HloComputation& computation, const HloInstructionSequence& sequence, 107 const HloAliasAnalysis& alias_analysis, 108 const LogicalBuffer::SizeFunction& size_function, 109 const absl::flat_hash_map<const HloComputation*, int64>* 110 memory_by_computation = nullptr); 111 112 static StatusOr<int64> MinimumMemoryForComputation( 113 const HloComputation& computation, const HloInstructionSequence& sequence, 114 const HloAliasAnalysis& alias_analysis, 115 const LogicalBuffer::SizeFunction& size_function, 116 const HloSchedule* schedule); 117 118 // Run the heap simulation with the given algorithm, assuming the given 119 // schedule, which must contain a topologically-consistent total 120 // ordering of all instructions within each computation. The result is invalid 121 // if instructions are not run in exactly this sequence. 122 // 123 // Running heap simulation on the whole module tends to save memory, compared 124 // to running on a per-computation basis, since we can re-use buffer space for 125 // called sub-computations. 126 // 127 static StatusOr<Result> Run(std::unique_ptr<HeapAlgorithm> algorithm, 128 const HloModule& module, 129 const HloSchedule& schedule, 130 const HloAliasAnalysis& alias_analysis, 131 const BufferValue::SizeFunction& size_fn, 132 const Options& options = Options()); 133 134 // Same as above, but runs on a single computation. The 'instruction_sequence' 135 // must contain a topologically-consistent total ordering of all instructions 136 // in the computation. The result is invalid if instructions are not run in 137 // exactly this sequence. 138 static StatusOr<Result> Run( 139 std::unique_ptr<HeapAlgorithm> algorithm, 140 const HloComputation& computation, 141 const HloInstructionSequence& instruction_sequence, 142 const HloAliasAnalysis& alias_analysis, 143 const BufferValue::SizeFunction& size_fn, 144 const Options& options = Options(), 145 const absl::flat_hash_map<const HloComputation*, int64>* 146 memory_by_computation = nullptr); 147 148 // Same as above, but runs on with a schedule that covers all nested 149 // computations. 150 static StatusOr<Result> Run( 151 std::unique_ptr<HeapAlgorithm> algorithm, 152 const HloComputation& computation, 153 const HloInstructionSequence& instruction_sequence, 154 const HloAliasAnalysis& alias_analysis, 155 const BufferValue::SizeFunction& size_fn, const HloSchedule* schedule, 156 const Options& options = Options()); 157 158 private: 159 // If 'schedule' is non-null, it is used to find kCall and kWhile 160 // sub-computations, and the heap simulation for those sub-computations will 161 // be run recursively. I.e. the simulation is run over the whole module. 162 HeapSimulator(std::unique_ptr<HeapAlgorithm> algorithm, 163 const BufferValue::SizeFunction& size_fn, 164 const Options& options, const HloSchedule* schedule = nullptr, 165 const absl::flat_hash_map<const HloComputation*, int64>* 166 memory_by_computation = nullptr); 167 ~HeapSimulator(); 168 169 Status RunComputation(const HloComputation& computation, 170 const HloInstructionSequence& instruction_sequence, 171 const HloAliasAnalysis& alias_analysis, 172 HloLiveRange* live_range); 173 174 bool IgnoreBuffer(const HloValue* buffer) const; 175 void Alloc(const HloValue* buffer, const HloInstruction* instruction); 176 void Free(const HloValue* buffer, const HloInstruction* instruction); 177 // ShareBuffer indicates that a new buffer is defined and it has to be the 178 // same address as the shared one. 179 void ShareBuffer(const HloValue* buffer, const HloValue* shared, 180 const HloInstruction* instruction); 181 182 // Returns true if: 183 // Two buffers belong to the same shared group. 184 // Eight of the buffer has no shared group assigned. 185 bool InSameSharedGroup(const HloValue* left, const HloValue* right); 186 Result Finish(); 187 188 void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind, 189 const HloValue* buffer, const HloInstruction* instruction, 190 const HloValue* share_with_canonical); 191 192 // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap, 193 // in which case we are calculating the same allocs/frees twice in the 194 // simulation. 195 const std::unique_ptr<NoFragmentationStatsHeap> no_fragmentation_stats_; 196 const std::unique_ptr<HeapAlgorithm> algorithm_; 197 const BufferValue::SizeFunction size_fn_; 198 const Options options_; 199 // schedule_ is set by buffer assignment, and memory_by_computation_ is 200 // set by hlo scheduling. Then, in RunComputation, we check both in order to 201 // handle subcomputations. It would be good to unify the handling of 202 // subcomputations, but it's not clear how. 203 const HloSchedule* schedule_; 204 const absl::flat_hash_map<const HloComputation*, int64>* 205 memory_by_computation_; 206 207 // Hold some sets for error-checking the sequence of Alloc and Free calls. 208 absl::flat_hash_set<const HloValue*> allocated_buffers_; 209 absl::flat_hash_set<const HloValue*> freed_buffers_; 210 211 // Debugging information filled in while the heap simulator runs. 212 HeapSimulatorTrace debug_trace_; 213 }; 214 215 // Abstract base class describing a heap simulation algorithm that assigns 216 // offsets to buffers. A sequence of Alloc / Free calls will be made, with the 217 // same semantics as a regular memory heap. Finish will be called at the end to 218 // collect the simulation results. 219 class HeapAlgorithm { 220 public: 221 using Chunk = HeapSimulator::Chunk; 222 using Result = HeapSimulator::Result; 223 224 virtual ~HeapAlgorithm() = default; 225 226 // Alloc allocates a buffer of 'size' bytes. 227 virtual void Alloc(const HloValue* buffer, int64 size) = 0; 228 229 // Takes memory usage of subcomputations into account when calculating the 230 // memory usage of a computation. Currently, we don't handle buffer aliasing 231 // between computations entirely correctly. We are careful to not double count 232 // for the output buffers of whiles/conds/calls. But we don't take into 233 // account other aliases, such as for the while init. A more thorough solution 234 // would require something like BufferAssignment::BuildColocatedBufferSets. 235 // TODO(b/65835246): 236 // Since TuplePointsToAnalysis is being replaced with a module-aware alias 237 // analysis, it's not worth making major changes to HeapSimulator now. AccountForSubcomputationMemory(const HloInstruction * instruction,int64 alloc_size_by_instruction,const absl::flat_hash_map<const HloComputation *,int64> & memory_by_computation)238 virtual void AccountForSubcomputationMemory( 239 const HloInstruction* instruction, 240 // The total number of bytes allocated by instruction. 241 int64 alloc_size_by_instruction, 242 const absl::flat_hash_map<const HloComputation*, int64>& 243 memory_by_computation) {} 244 245 // Free de-allocates a previously allocated buffer. 246 virtual void Free(const HloValue* buffer, int64 size) = 0; 247 248 // Indicates that a buffer has to be collocated with another buffer. In 249 // addition to Alloc and Free, the heap simulator exposes a concept of buffer 250 // sharing. When ShareBuffer is called, instead of allocating new space for 251 // the buffer, it associates the buffer with a previously allocated (or 252 // shared) buffer. Each group of mutually-shared buffers points to a single 253 // SharedGroup instance, which is a shared control block. ShareWith(const HloValue * buffer,const HloValue * share_with,int64 size)254 virtual void ShareWith(const HloValue* buffer, const HloValue* share_with, 255 int64 size) { 256 Alloc(buffer, size); 257 } 258 259 // Finish collects the buffer offset assignment results. Finish may only be 260 // called once, after all Alloc and Free calls. 261 virtual Result Finish() = 0; 262 }; 263 264 // NoFragmentationStatsHeap computes the heap size assuming no fragmentation; 265 // this is the absolute minimum size for a given instruction sequence. The 266 // result.chunk_map returned in Finish is always empty, since we only collect 267 // stats, and don't actually compute chunk assignments. 268 class NoFragmentationStatsHeap : public HeapAlgorithm { 269 public: 270 NoFragmentationStatsHeap() = default; 271 ~NoFragmentationStatsHeap() override = default; 272 273 void Alloc(const HloValue* buffer, int64 size) override; 274 275 void AccountForSubcomputationMemory( 276 const HloInstruction* instruction, int64 alloc_size_by_instruction, 277 const absl::flat_hash_map<const HloComputation*, int64>& 278 memory_by_computation) override; 279 280 void Free(const HloValue* buffer, int64 size) override; 281 282 Result Finish() override; 283 284 private: 285 int64 current_heap_size_ = 0; 286 int64 max_heap_size_ = 0; 287 }; 288 289 // Node in BufferIntervalTree that stores the alloc and free times of a buffer, 290 // and the chunk assigned to it. 291 struct BufferIntervalTreeNode { 292 // Alloc time. 293 int64 start; 294 // Free time. 295 int64 end; 296 // Maximum free time of all nodes in the subtree where this node is the root. 297 int64 subtree_end; 298 // Allocated chunk for the buffer. 299 HeapSimulator::Chunk chunk; 300 // Left child. 301 BufferIntervalTreeNode* left; 302 // Right child. 303 BufferIntervalTreeNode* right; 304 }; 305 306 // An interval tree that can query buffers overlapping in time. 307 class BufferIntervalTree { 308 public: 309 using Chunk = HeapSimulator::Chunk; 310 // Adds a buffer to the interval tree, with the time interval and allocated 311 // chunk specified. 312 void Add(int64 start, int64 end, const Chunk& chunk); 313 314 // Returns vector of allocated chunks that overlap with the given time 315 // interval. 316 std::vector<Chunk> ChunksOverlappingInTime(int64 start, int64 end) const; 317 318 private: 319 std::list<BufferIntervalTreeNode> node_storage_; 320 }; 321 322 // GlobalDecreasingSizeBestFitHeap collects the live intervals of all buffers, 323 // then allocates them in decreasing spatial or temporal size regardless of the 324 // alloc/free time. It internally tracks the allocated buffers and their live 325 // intervals; when allocating a buffer, it finds the best-fit free chunk during 326 // its live interval. 327 class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { 328 public: 329 enum Type { 330 kSpatial = 0, 331 kTemporal, 332 }; 333 334 // BufferInterval stores a buffer's size and time interval. 335 struct BufferInterval { 336 const HloValue* buffer; 337 int64 size; 338 // Alloc time of the buffer. 339 int64 start; 340 // Free time of the buffer. 341 int64 end; 342 343 // Colocation buffers that need to be collocated with this one. 344 std::vector<const HloValue*> colocations; 345 346 // True if this buffer needs an allocation. False if it is collocated with 347 // other buffer. 348 bool need_allocation; 349 }; 350 351 // Comparison function that is used to store buffer intervals. 352 using BufferIntervalCompare = 353 std::function<bool(const BufferInterval&, const BufferInterval&)>; 354 355 explicit GlobalDecreasingSizeBestFitHeap(int64 alignment, 356 Type type = kSpatial); ~GlobalDecreasingSizeBestFitHeap()357 ~GlobalDecreasingSizeBestFitHeap() override {} 358 359 void Alloc(const HloValue* buffer, int64 size) override; 360 void Free(const HloValue* buffer, int64 size) override; 361 362 void ShareWith(const HloValue* buffer, const HloValue* share_with, 363 int64 size) override; 364 365 Result Finish() override; 366 367 // Return a BufferIntervalCompare function that sort by spatial size. We don't 368 // look at co-locates as they should have the same size. 369 static BufferIntervalCompare GetSpatialBufferIntervalCompare(); 370 371 protected: 372 // The candidate contains a chunk and the resultant heap size if this 373 // chunk is to be committed. 374 struct ChunkCandidate { 375 Chunk chunk; 376 int64 heap_size; 377 }; 378 379 // Returns the buffer intervals sorted according to buffer_interval_compare_. 380 std::vector<BufferInterval> GetSortedBufferIntervals() const; 381 382 // These two methods below are exposed to other heap algorithms that inherit 383 // from this class. The Finish() method tries to find a candidate chunk for 384 // each BufferInterval, after calling GetSortedBufferIntervals. If a 385 // non-negative preferred_offset is provided, FindChunkCandidate attempts 386 // finding a chunk at this offset. The ChunkCandidate returns the chunk and 387 // the final heap size if it chunk is to be committed. The Finish() method can 388 // then call CommitChunk to associate the chunk with the BufferInterval, if 389 // the final heap size is within the limits. 390 ChunkCandidate FindChunkCandidate(const BufferInterval& buffer_interval, 391 int64 preferred_offset = -1) const; 392 void CommitChunk(const BufferInterval& buffer_interval, 393 ChunkCandidate chunk_candidate); 394 // Adds the buffer and the chunk to the result chunk map. 395 virtual void AddToChunkMap(const HloValue* buffer, Chunk chunk); 396 397 // Return a BufferIntervalCompare function that sorts by live ranges. A live 398 // range is defined by the range between the start of the first buffer and the 399 // end of the last co-located buffer. There could be "holes" in the live 400 // ranges of each co-located buffers, but in this heuristics we think they are 401 // contiguous. 402 BufferIntervalCompare GetTemporalBufferIntervalCompare() const; 403 404 absl::flat_hash_map<const HloValue*, BufferInterval> buffer_intervals_; 405 Result result_; 406 BufferIntervalCompare buffer_interval_compare_; 407 408 private: 409 int64 alignment_; 410 411 // The current time represented as an integer. It increments by 1 at each 412 // Alloc or Free call. 413 int64 current_time_ = 0; 414 415 BufferIntervalTree interval_tree_; 416 417 // Returns all transitive colocated buffers of this buffer interval. I.e., If 418 // a buffer A is colocated with B and B is colocated with C, this function 419 // returns all three of them. 420 absl::flat_hash_set<const HloValue*> GetTransitiveColocations( 421 const BufferInterval& interval) const; 422 }; 423 424 // A heap algorithm that chooses the best results from other algorithms added to 425 // it. 426 class ChooseBestHeapAlgorithm : public HeapAlgorithm { 427 public: ChooseBestHeapAlgorithm(std::unique_ptr<std::vector<std::unique_ptr<HeapAlgorithm>>> algorithms)428 ChooseBestHeapAlgorithm( 429 std::unique_ptr<std::vector<std::unique_ptr<HeapAlgorithm>>> algorithms) 430 : algorithms_(std::move(*algorithms)) {} ~ChooseBestHeapAlgorithm()431 ~ChooseBestHeapAlgorithm() override {} 432 Alloc(const HloValue * buffer,int64 size)433 void Alloc(const HloValue* buffer, int64 size) override { 434 for (auto& algorithm : algorithms_) { 435 algorithm->Alloc(buffer, size); 436 } 437 } 438 ShareWith(const HloValue * buffer,const HloValue * share_with,int64 size)439 void ShareWith(const HloValue* buffer, const HloValue* share_with, 440 int64 size) override { 441 for (auto& algorithm : algorithms_) { 442 algorithm->ShareWith(buffer, share_with, size); 443 } 444 } 445 Free(const HloValue * buffer,int64 size)446 void Free(const HloValue* buffer, int64 size) override { 447 for (auto& algorithm : algorithms_) { 448 algorithm->Free(buffer, size); 449 } 450 } 451 452 Result Finish() override; 453 454 private: 455 std::vector<std::unique_ptr<HeapAlgorithm>> algorithms_; 456 }; 457 458 } // namespace xla 459 460 #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_ 461