/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_

#include <functional>
#include <list>
#include <memory>
#include <set>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "tensorflow/compiler/xla/service/buffer_value.h"
#include "tensorflow/compiler/xla/service/buffer_value_containers.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_buffer.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_live_range.h"
#include "tensorflow/compiler/xla/service/hlo_ordering.h"
#include "tensorflow/compiler/xla/service/hlo_schedule.h"
#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {

// Forward declare classes defined below.
template <typename BufferType>
class HeapAlgorithm;
template <typename BufferType>
class NoFragmentationStatsHeap;

// HeapSimulator assigns buffer offsets by running a simulation of a regular
// memory heap with Alloc and Free calls. It only works for completely
// sequential instruction sequences. Unlike regular heaps, we have the
// advantage that the sequence of Alloc and Free calls is known up-front; we
// don't need to return the assignment of buffer offsets until the very end.
class HeapSimulator {
 public:
  // Chunk represents a contiguous piece of memory. Each BufferValue will be
  // associated with a chunk in the assignment result.
  struct Chunk {
    int64 offset;
    int64 size;

    int64 chunk_end() const { return offset + size; }

    bool OverlapsWith(Chunk other_chunk) const;

    bool operator==(const Chunk& other) const {
      return offset == other.offset && size == other.size;
    }
  };
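
  // Example (a sketch of our reading; OverlapsWith's exact boundary behavior
  // is defined in the .cc file): a chunk covers the byte range
  // [offset, chunk_end()), so two chunks overlap iff those ranges intersect.
  //   Chunk a{/*offset=*/0, /*size=*/16};  // occupies [0, 16)
  //   Chunk b{/*offset=*/8, /*size=*/16};  // occupies [8, 24)
  //   a.chunk_end();      // 16
  //   a.OverlapsWith(b);  // expected true: the ranges share [8, 16)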

  template <typename BufferType>
  struct HeapResult {
    // The assignment of buffers to chunks.
    absl::flat_hash_map<const BufferType*, Chunk> chunk_map;

    // The total size in bytes of the heap, containing all assigned chunks.
    int64 heap_size = 0;
  };

  // Result represents the result of the heap simulation.
  template <typename BufferType>
  struct Result {
    // Heap results.
    std::vector<HeapResult<BufferType>> heap_results;

    // The total size in bytes of the heaps.
    // heap_size == sum([hr.heap_size for hr in heap_results]).
    int64 heap_size = 0;

    // The total size in bytes of heap fragmentation.
    int64 fragmentation_size = 0;

    // A trace of heap simulation events.
    HeapSimulatorTrace debug_trace;
  };

  // The different options to be passed to the Run() APIs.
  struct Options {
    Options()
        : may_reuse_operand_buffers(true),
          alloc_constants(false),
          buffers_to_assign(nullptr) {}

    // Whether a buffer about to be Free()-ed can be recycled for a newborn
    // one, collapsing Free()+Alloc() pairs (default true).
    bool may_reuse_operand_buffers;
    // Whether to issue Alloc() and Free() calls for constants (default false).
    bool alloc_constants;
    // If 'buffers_to_assign' is provided, only those buffers are assigned
    // offsets; otherwise all buffers defined by the instructions are assigned.
    const absl::flat_hash_set<const HloValue*>* buffers_to_assign;
  };

  // Returns the minimum memory required to compute an HLO module where all
  // computations have been scheduled (represented by the given schedule),
  // assuming no fragmentation.
  static StatusOr<int64> MinimumMemoryForModule(
      const HloSchedule& schedule,
      const LogicalBuffer::SizeFunction& size_function);

  // Returns the minimum memory required to compute the given computation,
  // assuming no fragmentation.
  static StatusOr<int64> MinimumMemoryForComputation(
      const HloComputation& computation,
      const HloInstructionSequence& sequence,
      const HloAliasAnalysis& alias_analysis,
      const LogicalBuffer::SizeFunction& size_function,
      const absl::flat_hash_map<const HloComputation*, int64>*
          memory_by_computation = nullptr);

  static StatusOr<int64> MinimumMemoryForComputation(
      const HloComputation& computation,
      const HloInstructionSequence& sequence,
      const HloAliasAnalysis& alias_analysis,
      const LogicalBuffer::SizeFunction& size_function,
      const HloSchedule* schedule);

  // Run the heap simulation with the given algorithm, assuming the given
  // schedule, which must contain a topologically-consistent total ordering of
  // all instructions within each computation. The result is invalid if
  // instructions are not run in exactly this sequence.
  //
  // Running heap simulation on the whole module tends to save memory,
  // compared to running on a per-computation basis, since we can re-use
  // buffer space for called sub-computations.
  static StatusOr<Result<HloValue>> Run(
      std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
      const HloModule& module, const HloSchedule& schedule,
      const HloAliasAnalysis& alias_analysis,
      const BufferValue::SizeFunction& size_fn,
      const Options& options = Options());

  // Same as above, but runs on a single computation. The
  // 'instruction_sequence' must contain a topologically-consistent total
  // ordering of all instructions in the computation. The result is invalid if
  // instructions are not run in exactly this sequence.
  static StatusOr<Result<HloValue>> Run(
      std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
      const HloComputation& computation,
      const HloInstructionSequence& instruction_sequence,
      const HloAliasAnalysis& alias_analysis,
      const BufferValue::SizeFunction& size_fn,
      const Options& options = Options(),
      const absl::flat_hash_map<const HloComputation*, int64>*
          memory_by_computation = nullptr);

  // Same as above, but runs with a schedule that covers all nested
  // computations.
  static StatusOr<Result<HloValue>> Run(
      std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
      const HloComputation& computation,
      const HloInstructionSequence& instruction_sequence,
      const HloAliasAnalysis& alias_analysis,
      const BufferValue::SizeFunction& size_fn, const HloSchedule* schedule,
      const Options& options = Options());
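
  // Example (a hedged sketch, not a prescribed recipe): whole-module
  // simulation with a best-fit algorithm. Assumes `module` is a scheduled
  // HloModule, `alias_analysis` came from HloAliasAnalysis::Run, and `size_fn`
  // returns a buffer's size in bytes; none of these names are defined in this
  // header.
  //   TF_ASSIGN_OR_RETURN(
  //       HeapSimulator::Result<HloValue> result,
  //       HeapSimulator::Run(
  //           absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
  //               /*alignment=*/1),
  //           module, module.schedule(), alias_analysis, size_fn));
  //   // result.heap_size is the simulated heap size; each entry of
  //   // result.heap_results maps buffers to their assigned Chunks.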

 private:
  // If 'schedule' is non-null, it is used to find kCall and kWhile
  // sub-computations, and the heap simulation for those sub-computations will
  // be run recursively. I.e., the simulation is run over the whole module.
  HeapSimulator(std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
                const BufferValue::SizeFunction& size_fn,
                const Options& options, const HloSchedule* schedule = nullptr,
                const absl::flat_hash_map<const HloComputation*, int64>*
                    memory_by_computation = nullptr);
  ~HeapSimulator();

  Status RunComputation(const HloComputation& computation,
                        const HloInstructionSequence& instruction_sequence,
                        const HloAliasAnalysis& alias_analysis,
                        HloLiveRange* live_range);

  bool IgnoreBuffer(const HloValue* buffer) const;
  void Alloc(const HloValue* buffer, const HloInstruction* instruction);
  void Free(const HloValue* buffer, const HloInstruction* instruction);
  // ShareBuffer indicates that a new buffer is defined and it has to occupy
  // the same address as the shared one.
  void ShareBuffer(const HloValue* buffer, const HloValue* shared,
                   const HloInstruction* instruction);

  // Returns true if:
  //  - the two buffers belong to the same shared group, or
  //  - either of the buffers has no shared group assigned.
  bool InSameSharedGroup(const HloValue* left, const HloValue* right);

  Result<HloValue> Finish();

  void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
                      const HloValue* buffer,
                      const HloInstruction* instruction,
                      const HloValue* share_with_canonical);

  // Counterintuitive: algorithm_ itself can be a NoFragmentationStatsHeap, in
  // which case we are calculating the same allocs/frees twice in the
  // simulation.
  const std::unique_ptr<NoFragmentationStatsHeap<HloValue>>
      no_fragmentation_stats_;
  const std::unique_ptr<HeapAlgorithm<HloValue>> algorithm_;
  const BufferValue::SizeFunction size_fn_;
  const Options options_;
  // schedule_ is set by buffer assignment, and memory_by_computation_ is set
  // by hlo scheduling. Then, in RunComputation, we check both in order to
  // handle subcomputations. It would be good to unify the handling of
  // subcomputations, but it's not clear how.
  const HloSchedule* schedule_;
  const absl::flat_hash_map<const HloComputation*, int64>*
      memory_by_computation_;

  // Hold some sets for error-checking the sequence of Alloc and Free calls.
  absl::flat_hash_set<const HloValue*> allocated_buffers_;
  absl::flat_hash_set<const HloValue*> freed_buffers_;

  // Debugging information filled in while the heap simulator runs.
  HeapSimulatorTrace debug_trace_;
};
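
// Example (a hedged sketch): computing the fragmentation-free lower bound for
// a scheduled module. Assumes `module` is a scheduled HloModule and `size_fn`
// is a LogicalBuffer::SizeFunction; neither is defined in this header.
//   TF_ASSIGN_OR_RETURN(
//       int64 min_bytes,
//       HeapSimulator::MinimumMemoryForModule(module.schedule(), size_fn));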
// Abstract base class describing a heap simulation algorithm that assigns
// offsets to buffers. A sequence of Alloc / Free calls will be made, with the
// same semantics as a regular memory heap. Finish will be called at the end
// to collect the simulation results.
template <typename BufferType>
class HeapAlgorithm {
 public:
  using Chunk = HeapSimulator::Chunk;
  using Result = HeapSimulator::Result<BufferType>;
  using HeapResult = HeapSimulator::HeapResult<BufferType>;

  virtual ~HeapAlgorithm() = default;

  // Alloc allocates a buffer of 'size' bytes.
  virtual void Alloc(const BufferType* buffer, int64 size) = 0;

  // Takes the memory usage of subcomputations into account when calculating
  // the memory usage of a computation. Currently we don't handle buffer
  // aliasing between computations entirely correctly. We are careful to not
  // double-count for the output buffers of whiles/conds/calls, but we don't
  // take into account other aliases, such as the while init. A more thorough
  // solution would require something like
  // BufferAssignment::BuildColocatedBufferSets.
  // TODO(b/65835246):
  // Since TuplePointsToAnalysis is being replaced with a module-aware alias
  // analysis, it's not worth making major changes to HeapSimulator now.
  virtual void AccountForSubcomputationMemory(
      const HloInstruction* instruction,
      // The total number of bytes allocated by instruction.
      int64 alloc_size_by_instruction,
      const absl::flat_hash_map<const HloComputation*, int64>&
          memory_by_computation) {}

  // Free de-allocates a previously allocated buffer.
  virtual void Free(const BufferType* buffer, int64 size) = 0;

  // Indicates that a buffer has to be collocated with another buffer. In
  // addition to Alloc and Free, the heap simulator exposes a concept of
  // buffer sharing. When ShareBuffer is called, instead of allocating new
  // space for the buffer, it associates the buffer with a previously
  // allocated (or shared) buffer. Each group of mutually-shared buffers
  // points to a single SharedGroup instance, which is a shared control block.
  // The default implementation does not share; it simply allocates new space.
  virtual void ShareWith(const BufferType* buffer,
                         const BufferType* share_with, int64 size) {
    Alloc(buffer, size);
  }

  // Finish collects the buffer offset assignment results. Finish may only be
  // called once, after all Alloc and Free calls.
  virtual Result Finish() = 0;
};
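
// Example (a hedged sketch): a minimal HeapAlgorithm that bump-allocates
// every buffer at a fresh offset and never reuses freed space.
// `BumpAllocatorHeap` is hypothetical and exists nowhere in XLA; it only
// illustrates the interface contract above.
//   template <typename BufferType>
//   class BumpAllocatorHeap : public HeapAlgorithm<BufferType> {
//    public:
//     void Alloc(const BufferType* buffer, int64 size) override {
//       // Place each buffer directly past the previous one.
//       result_.chunk_map[buffer] = HeapSimulator::Chunk{heap_end_, size};
//       heap_end_ += size;
//     }
//     void Free(const BufferType* buffer, int64 size) override {}
//     HeapSimulator::Result<BufferType> Finish() override {
//       result_.heap_size = heap_end_;
//       HeapSimulator::Result<BufferType> result;
//       result.heap_size = heap_end_;
//       result.heap_results.push_back(std::move(result_));
//       return result;
//     }
//    private:
//     HeapSimulator::HeapResult<BufferType> result_;
//     int64 heap_end_ = 0;
//   };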

// NoFragmentationStatsHeap computes the heap size assuming no fragmentation;
// this is the absolute minimum size for a given instruction sequence. The
// result.chunk_map returned in Finish is always empty, since we only collect
// stats, and don't actually compute chunk assignments.
template <typename BufferType>
class NoFragmentationStatsHeap : public HeapAlgorithm<BufferType> {
 public:
  using Result = HeapSimulator::Result<BufferType>;

  NoFragmentationStatsHeap() = default;
  ~NoFragmentationStatsHeap() override = default;

  void Alloc(const BufferType* buffer, int64 size) override;

  void AccountForSubcomputationMemory(
      const HloInstruction* instruction, int64 alloc_size_by_instruction,
      const absl::flat_hash_map<const HloComputation*, int64>&
          memory_by_computation) override;

  void Free(const BufferType* buffer, int64 size) override;

  Result Finish() override;

 private:
  int64 current_heap_size_ = 0;
  int64 max_heap_size_ = 0;
};

// Node in BufferIntervalTree that stores the alloc and free times of a
// buffer, and the chunk assigned to it.
struct BufferIntervalTreeNode {
  // Alloc time.
  int64 start;
  // Free time.
  int64 end;
  // Maximum free time of all nodes in the subtree where this node is the
  // root.
  int64 subtree_end;
  // Allocated chunk for the buffer.
  HeapSimulator::Chunk chunk;
  // Left child.
  BufferIntervalTreeNode* left;
  // Right child.
  BufferIntervalTreeNode* right;
  // Parent.
  BufferIntervalTreeNode* parent;
};

// An interval tree that can query buffers overlapping in time.
class BufferIntervalTree {
 public:
  using Chunk = HeapSimulator::Chunk;

  // Adds a buffer to the interval tree, with the time interval and allocated
  // chunk specified.
  void Add(int64 start, int64 end, const Chunk& chunk);

  // Removes the interval from the tree. Returns true if the chunk is removed.
  bool Remove(int64 start, int64 end, const Chunk& chunk);

  // Returns a vector of allocated chunks that overlap with the given time
  // interval.
  std::vector<Chunk> ChunksOverlappingInTime(int64 start, int64 end) const;

  BufferIntervalTreeNode* GetRoot() { return root_; }

 private:
  BufferIntervalTreeNode* root_ = nullptr;
  std::list<BufferIntervalTreeNode> node_storage_;
};
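
// Example (a hedged sketch): tracking two buffers whose live times overlap;
// the chunk values are arbitrary.
//   BufferIntervalTree tree;
//   tree.Add(/*start=*/0, /*end=*/10, HeapSimulator::Chunk{0, 16});
//   tree.Add(/*start=*/5, /*end=*/15, HeapSimulator::Chunk{16, 32});
//   // Both buffers are live somewhere in [8, 12], so both chunks are
//   // expected in the result.
//   std::vector<HeapSimulator::Chunk> live =
//       tree.ChunksOverlappingInTime(8, 12);
//   tree.Remove(/*start=*/0, /*end=*/10, HeapSimulator::Chunk{0, 16});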

// GlobalDecreasingSizeBestFitHeap collects the live intervals of all buffers,
// then allocates them in decreasing spatial or temporal size, regardless of
// the alloc/free time. It internally tracks the allocated buffers and their
// live intervals; when allocating a buffer, it finds the best-fit free chunk
// during its live interval.
template <typename BufferType>
class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
 public:
  using HeapResult = HeapSimulator::HeapResult<BufferType>;
  using Result = HeapSimulator::Result<BufferType>;
  using Chunk = HeapSimulator::Chunk;

  enum Type {
    kSpatial = 0,
    kTemporal,
  };

  // BufferInterval stores a buffer's size and time interval.
  struct BufferInterval {
    const BufferType* buffer;
    int64 size;
    // Alloc time of the buffer.
    int64 start;
    // Free time of the buffer.
    int64 end;

    // Buffers that need to be colocated with this one.
    std::vector<const BufferType*> colocations;

    // True if this buffer needs an allocation. False if it is colocated with
    // another buffer.
    bool need_allocation;
  };

  // Comparison function used to sort buffer intervals.
  using BufferIntervalCompare =
      std::function<bool(const BufferInterval&, const BufferInterval&)>;

  explicit GlobalDecreasingSizeBestFitHeap(int64 alignment,
                                           Type type = kSpatial);
  ~GlobalDecreasingSizeBestFitHeap() override {}

  void Alloc(const BufferType* buffer, int64 size) override;
  void Free(const BufferType* buffer, int64 size) override;

  void ShareWith(const BufferType* buffer, const BufferType* share_with,
                 int64 size) override;

  Result Finish() override;

  // Returns a BufferIntervalCompare function that sorts by spatial size. We
  // don't look at colocations, as they should have the same size.
  static BufferIntervalCompare GetSpatialBufferIntervalCompare();

 protected:
  // The candidate contains a chunk and the resultant heap size if this chunk
  // is to be committed.
  struct ChunkCandidate {
    Chunk chunk;
    int64 heap_size;
  };

  // Returns the buffer intervals sorted according to
  // buffer_interval_compare_.
  std::vector<BufferInterval> GetSortedBufferIntervals() const;

  // The two methods below are exposed to other heap algorithms that inherit
  // from this class. Their Finish() method tries to find a candidate chunk
  // for each BufferInterval, after calling GetSortedBufferIntervals. If a
  // non-negative preferred_offset is provided, FindChunkCandidate attempts to
  // find a chunk at that offset. The returned ChunkCandidate contains the
  // chunk and the final heap size if the chunk were committed. Finish() can
  // then call CommitChunk to associate the chunk with the BufferInterval, if
  // the final heap size is within the limits.
  ChunkCandidate FindChunkCandidate(const BufferInterval& buffer_interval,
                                    int64 preferred_offset = -1) const;
  void CommitChunk(const BufferInterval& buffer_interval,
                   ChunkCandidate chunk_candidate);

  // Adds the buffer and the chunk to the result chunk map.
  virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk);

  // Returns a BufferIntervalCompare function that sorts by live range. A live
  // range spans from the start of the first buffer to the end of the last
  // colocated buffer. There could be "holes" in the live ranges of colocated
  // buffers, but this heuristic treats them as contiguous.
  BufferIntervalCompare GetTemporalBufferIntervalCompare() const;

  absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
  HeapResult result_;
  BufferIntervalCompare buffer_interval_compare_;
  BufferIntervalTree interval_tree_;

 private:
  int64 alignment_;

  // The current time represented as an integer. It increments by 1 at each
  // Alloc or Free call.
  int64 current_time_ = 0;

  // Returns all transitive colocated buffers of this buffer interval. I.e.,
  // if buffer A is colocated with B and B is colocated with C, this function
  // returns all three of them.
  absl::flat_hash_set<const BufferType*> GetTransitiveColocations(
      const BufferInterval& interval) const;
};
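
// Example (a hedged sketch): how a derived algorithm's Finish() typically
// uses the protected helpers above, per the comment on FindChunkCandidate.
// `limit` is a hypothetical bound supplied by the derived class.
//   for (const BufferInterval& interval : GetSortedBufferIntervals()) {
//     if (!interval.need_allocation) continue;
//     ChunkCandidate candidate = FindChunkCandidate(interval);
//     // Commit only if the resulting heap size stays within the limit.
//     if (candidate.heap_size <= limit) CommitChunk(interval, candidate);
//   }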

// This class implements an algorithm that will produce multiple heaps, where
// each heap size is constrained by a given limit. Note that the constraint is
// soft, meaning that a valid heap result is generated even if there are some
// buffer sizes larger than the given constraint size.
//
// Pseudocode:
//   while (`buffers` is not empty) {
//     create a new heap `h`
//     for (each buffer `buf` in `buffers` in size-decreasing order) {
//       if (buf.size() is larger than the heap size limit &&
//           `h` is empty) {
//         h.place(buf)
//         buffers.remove(buf)
//       } else if (placing `buf` into `h` does not violate size
//                  constraint) {
//         h.place(buf)
//         buffers.remove(buf)
//       }
//     }
//   }
class ConstrainedGlobalDecreasingSizeBestFitHeap
    : public GlobalDecreasingSizeBestFitHeap<HloValue> {
 public:
  explicit ConstrainedGlobalDecreasingSizeBestFitHeap(
      uint64 size_limit_per_heap, int64 alignment, Type type = kSpatial)
      : GlobalDecreasingSizeBestFitHeap<HloValue>(alignment, type),
        size_limit_per_heap_(size_limit_per_heap) {}
  ~ConstrainedGlobalDecreasingSizeBestFitHeap() override {}

  Result Finish() override;

 private:
  uint64 size_limit_per_heap_;
};

// A heap algorithm that chooses the best result from other algorithms added
// to it.
template <typename BufferType>
class ChooseBestHeapAlgorithm : public HeapAlgorithm<BufferType> {
 public:
  using Result = HeapSimulator::Result<BufferType>;

  ChooseBestHeapAlgorithm(
      std::unique_ptr<std::vector<std::unique_ptr<HeapAlgorithm<BufferType>>>>
          algorithms)
      : algorithms_(std::move(*algorithms)) {}
  ~ChooseBestHeapAlgorithm() override {}

  void Alloc(const BufferType* buffer, int64 size) override {
    for (auto& algorithm : algorithms_) {
      algorithm->Alloc(buffer, size);
    }
  }

  void ShareWith(const BufferType* buffer, const BufferType* share_with,
                 int64 size) override {
    for (auto& algorithm : algorithms_) {
      algorithm->ShareWith(buffer, share_with, size);
    }
  }

  void Free(const BufferType* buffer, int64 size) override {
    for (auto& algorithm : algorithms_) {
      algorithm->Free(buffer, size);
    }
  }

  Result Finish() override;

 private:
  std::vector<std::unique_ptr<HeapAlgorithm<BufferType>>> algorithms_;
};

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_