/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_

#include <algorithm>
#include <cstdint>
#include <functional>
#include <list>
#include <memory>
#include <set>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/container/inlined_vector.h"
#include "tensorflow/compiler/xla/service/buffer_value.h"
#include "tensorflow/compiler/xla/service/buffer_value_containers.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_buffer.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_live_range.h"
#include "tensorflow/compiler/xla/service/hlo_ordering.h"
#include "tensorflow/compiler/xla/service/hlo_schedule.h"
#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {

// Forward declare classes defined below.
template <typename BufferType>
class HeapAlgorithm;
template <typename BufferType>
class NoFragmentationStatsHeap;

// HeapSimulator assigns buffer offsets by running a simulation of a regular
// memory heap with Alloc and Free calls. It only works for completely
// sequential instruction sequences. Unlike regular heaps, we have the
// advantage that the sequence of Alloc and Free calls is known up-front; we
// don't need to return the assignment of buffer offsets until the very end.
class HeapSimulator {
 public:
  // Chunk represents a contiguous piece of memory. Each BufferValue will be
  // associated with a chunk in the assignment result.
  struct Chunk {
    int64_t offset;
    int64_t size;

    int64_t chunk_end() const { return offset + size; }

    bool OverlapsWith(Chunk other_chunk) const;

    bool operator==(const Chunk& other) const {
      return offset == other.offset && size == other.size;
    }
  };

  template <typename BufferType>
  struct HeapResult {
    // Returns the updated heap size if `chunk` is added to the heap.
    int64_t UpdatedHeapSize(const Chunk& chunk) const {
      return std::max(heap_size, chunk.chunk_end());
    }

    // The assignment of buffers to chunks.
    absl::flat_hash_map<const BufferType*, Chunk> chunk_map;

    // The total size in bytes of the heap, containing all assigned chunks.
    int64_t heap_size = 0;
  };
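
  // For example (illustrative arithmetic): a chunk with offset 64 and size
  // 32 occupies bytes [64, 96), so chunk_end() == 96; for a heap that is
  // currently 80 bytes large, UpdatedHeapSize({/*offset=*/64, /*size=*/32})
  // returns max(80, 96) == 96.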

  // Result represents the result of the heap simulation.
  template <typename BufferType>
  struct Result {
    // Heap results.
    std::vector<HeapResult<BufferType>> heap_results;

    // The total size in bytes of the heaps.
    // heap_size == sum([hr.heap_size for hr in heap_results]).
    int64_t heap_size = 0;

    // The total size in bytes of heap fragmentation.
    int64_t fragmentation_size = 0;

    // A trace of heap simulation events.
    HeapSimulatorTrace debug_trace;
  };

  // The different options to be passed to the Run() APIs.
  struct Options {
    Options()
        : may_reuse_operand_buffers(true),
          alloc_constants(false),
          buffers_to_assign(nullptr) {}

    // Whether a buffer about to be Free()-ed can be recycled for a newly
    // defined one, collapsing the Free()+Alloc() pair (default true).
    bool may_reuse_operand_buffers;
    // Whether to issue Alloc() and Free() calls for constants (default
    // false).
    bool alloc_constants;
    // If 'buffers_to_assign' is provided, only those buffers are assigned
    // offsets; otherwise all buffers defined by the instructions are
    // assigned.
    const absl::flat_hash_set<const HloValue*>* buffers_to_assign;
  };

  // Returns the minimum memory required to compute an HLO module where all
  // computations have been scheduled (represented by the given schedule),
  // assuming no fragmentation.
  static StatusOr<int64_t> MinimumMemoryForModule(
      const HloSchedule& schedule,
      const LogicalBuffer::SizeFunction& size_function);

  // Returns the minimum memory required to compute the given computation,
  // assuming no fragmentation.
  static StatusOr<int64_t> MinimumMemoryForComputation(
      const HloComputation& computation,
      const HloInstructionSequence& sequence,
      const HloAliasAnalysis& alias_analysis,
      const LogicalBuffer::SizeFunction& size_function,
      const absl::flat_hash_map<const HloComputation*, int64_t>*
          memory_by_computation = nullptr);

  static StatusOr<int64_t> MinimumMemoryForComputation(
      const HloComputation& computation,
      const HloInstructionSequence& sequence,
      const HloAliasAnalysis& alias_analysis,
      const LogicalBuffer::SizeFunction& size_function,
      const HloSchedule* schedule);

  // Runs the heap simulation with the given algorithm, assuming the given
  // schedule, which must contain a topologically-consistent total ordering
  // of all instructions within each computation. The result is invalid if
  // instructions are not run in exactly this sequence.
  //
  // Running the heap simulation on the whole module tends to save memory,
  // compared to running on a per-computation basis, since we can reuse
  // buffer space for called sub-computations.
  static StatusOr<Result<HloValue>> Run(
      std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
      const HloModule& module, const HloSchedule& schedule,
      const HloAliasAnalysis& alias_analysis,
      const BufferValue::SizeFunction& size_fn,
      const Options& options = Options());
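
  // Example usage (an illustrative sketch, not a prescribed pattern;
  // `module`, `schedule`, `alias_analysis`, and `size_fn` are assumed to
  // have been built elsewhere):
  //
  //   auto algorithm =
  //       absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
  //           /*alignment=*/1);
  //   TF_ASSIGN_OR_RETURN(
  //       HeapSimulator::Result<HloValue> result,
  //       HeapSimulator::Run(std::move(algorithm), module, schedule,
  //                          alias_analysis, size_fn));
  //   // result.heap_size is the simulated peak heap size, and each
  //   // result.heap_results[i].chunk_map maps buffers to assigned chunks.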

  // Same as above, but runs on a single computation. The
  // 'instruction_sequence' must contain a topologically-consistent total
  // ordering of all instructions in the computation. The result is invalid
  // if instructions are not run in exactly this sequence.
  static StatusOr<Result<HloValue>> Run(
      std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
      const HloComputation& computation,
      const HloInstructionSequence& instruction_sequence,
      const HloAliasAnalysis& alias_analysis,
      const BufferValue::SizeFunction& size_fn,
      const Options& options = Options(),
      const absl::flat_hash_map<const HloComputation*, int64_t>*
          memory_by_computation = nullptr);

  // Same as above, but runs with a schedule that covers all nested
  // computations.
  static StatusOr<Result<HloValue>> Run(
      std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
      const HloComputation& computation,
      const HloInstructionSequence& instruction_sequence,
      const HloAliasAnalysis& alias_analysis,
      const BufferValue::SizeFunction& size_fn, const HloSchedule* schedule,
      const Options& options = Options());

 private:
  // If 'schedule' is non-null, it is used to find kCall and kWhile
  // sub-computations, and the heap simulation for those sub-computations
  // will be run recursively. I.e., the simulation is run over the whole
  // module.
  HeapSimulator(std::unique_ptr<HeapAlgorithm<HloValue>> algorithm,
                const BufferValue::SizeFunction& size_fn,
                const Options& options, const HloSchedule* schedule = nullptr,
                const absl::flat_hash_map<const HloComputation*, int64_t>*
                    memory_by_computation = nullptr);
  ~HeapSimulator();

  Status RunComputation(const HloComputation& computation,
                        const HloInstructionSequence& instruction_sequence,
                        const HloAliasAnalysis& alias_analysis,
                        HloLiveRange* live_range);

  bool IgnoreBuffer(const HloValue* buffer) const;
  void Alloc(const HloValue* buffer, const HloInstruction* instruction);
  void Free(const HloValue* buffer, const HloInstruction* instruction);
  // ShareBuffer indicates that a new buffer is defined and must be assigned
  // the same address as the shared one.
  void ShareBuffer(const HloValue* buffer, const HloValue* shared,
                   const HloInstruction* instruction);

  // Returns true if:
  //  - the two buffers belong to the same shared group, or
  //  - either buffer has no shared group assigned.
  bool InSameSharedGroup(const HloValue* left, const HloValue* right);

  Result<HloValue> Finish();

  void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
                      const HloValue* buffer,
                      const HloInstruction* instruction,
                      const HloValue* share_with_canonical);

  // Counterintuitive: the algorithm_ itself can be a
  // NoFragmentationStatsHeap, in which case we are calculating the same
  // allocs/frees twice in the simulation.
  const std::unique_ptr<NoFragmentationStatsHeap<HloValue>>
      no_fragmentation_stats_;
  const std::unique_ptr<HeapAlgorithm<HloValue>> algorithm_;
  const BufferValue::SizeFunction size_fn_;
  const Options options_;
  // schedule_ is set by buffer assignment, and memory_by_computation_ is set
  // by HLO scheduling. Then, in RunComputation, we check both in order to
  // handle subcomputations. It would be good to unify the handling of
  // subcomputations, but it's not clear how.
  const HloSchedule* schedule_;
  const absl::flat_hash_map<const HloComputation*, int64_t>*
      memory_by_computation_;
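
  // Illustrative trace (the exact ordering of Free calls is determined by
  // the live ranges): for a schedule `b = exp(a); c = bitcast(b)`, the
  // simulator might issue Alloc(a), Alloc(b), Free(a), ShareBuffer(c, b),
  // so that c occupies b's address instead of fresh space, with the
  // remaining Free calls issued at the end of each value's live range.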

  // Sets used for error-checking the sequence of Alloc and Free calls.
  absl::flat_hash_set<const HloValue*> allocated_buffers_;
  absl::flat_hash_set<const HloValue*> freed_buffers_;

  // Debugging information filled in while the heap simulator runs.
  HeapSimulatorTrace debug_trace_;
};

// Abstract base class describing a heap simulation algorithm that assigns
// offsets to buffers. A sequence of Alloc / Free calls will be made, with
// the same semantics as a regular memory heap. Finish will be called at the
// end to collect the simulation results.
template <typename BufferType>
class HeapAlgorithm {
 public:
  using Chunk = HeapSimulator::Chunk;
  using Result = HeapSimulator::Result<BufferType>;
  using HeapResult = HeapSimulator::HeapResult<BufferType>;

  virtual ~HeapAlgorithm() = default;

  // Alloc allocates a buffer of 'size' bytes.
  virtual void Alloc(const BufferType* buffer, int64_t size) = 0;

  // Takes the memory usage of subcomputations into account when calculating
  // the memory usage of a computation. Currently we don't handle buffer
  // aliasing between computations entirely correctly: we are careful not to
  // double-count the output buffers of whiles/conds/calls, but we don't
  // account for other aliases, such as the while init. A more thorough
  // solution would require something like
  // BufferAssignment::BuildColocatedBufferSets.
  // TODO(b/65835246):
  // Since TuplePointsToAnalysis is being replaced with a module-aware alias
  // analysis, it's not worth making major changes to HeapSimulator now.
  virtual void AccountForSubcomputationMemory(
      const HloInstruction* instruction,
      // The total number of bytes allocated by the instruction.
      int64_t alloc_size_by_instruction,
      const absl::flat_hash_map<const HloComputation*, int64_t>&
          memory_by_computation) {}

  // Free de-allocates a previously allocated buffer.
  virtual void Free(const BufferType* buffer, int64_t size) = 0;

  // Indicates that a buffer must be colocated with another buffer. In
  // addition to Alloc and Free, the heap simulator exposes a concept of
  // buffer sharing. When ShareWith is called, instead of allocating new
  // space for the buffer, it associates the buffer with a previously
  // allocated (or shared) buffer. Each group of mutually-shared buffers
  // points to a single SharedGroup instance, which is a shared control
  // block. The default implementation ignores the sharing hint and simply
  // allocates fresh space.
  virtual void ShareWith(const BufferType* buffer,
                         const BufferType* share_with, int64_t size) {
    Alloc(buffer, size);
  }

  // Finish collects the buffer offset assignment results. Finish may only be
  // called once, after all Alloc and Free calls.
  virtual Result Finish() = 0;
};
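
// An illustrative sketch of a minimal subclass (the name NaiveStackHeap is
// hypothetical): it stacks each buffer on top of the previous one and never
// reuses freed space, which trivially satisfies the HeapAlgorithm contract.
//
//   template <typename BufferType>
//   class NaiveStackHeap : public HeapAlgorithm<BufferType> {
//    public:
//     using Result = HeapSimulator::Result<BufferType>;
//     void Alloc(const BufferType* buffer, int64_t size) override {
//       result_.chunk_map[buffer] = HeapSimulator::Chunk{top_, size};
//       top_ += size;
//       result_.heap_size = top_;
//     }
//     void Free(const BufferType* buffer, int64_t size) override {}
//     Result Finish() override {
//       Result result;
//       result.heap_size = result_.heap_size;
//       result.heap_results.push_back(std::move(result_));
//       return result;
//     }
//
//    private:
//     HeapSimulator::HeapResult<BufferType> result_;
//     int64_t top_ = 0;
//   };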

// NoFragmentationStatsHeap computes the heap size assuming no fragmentation;
// this is the absolute minimum size for a given instruction sequence. The
// result.chunk_map returned in Finish is always empty, since we only collect
// stats, and don't actually compute chunk assignments.
template <typename BufferType>
class NoFragmentationStatsHeap : public HeapAlgorithm<BufferType> {
 public:
  using Result = HeapSimulator::Result<BufferType>;

  NoFragmentationStatsHeap() = default;
  ~NoFragmentationStatsHeap() override = default;

  void Alloc(const BufferType* buffer, int64_t size) override;

  void AccountForSubcomputationMemory(
      const HloInstruction* instruction, int64_t alloc_size_by_instruction,
      const absl::flat_hash_map<const HloComputation*, int64_t>&
          memory_by_computation) override;

  void Free(const BufferType* buffer, int64_t size) override;

  Result Finish() override;

 private:
  int64_t current_heap_size_ = 0;
  int64_t max_heap_size_ = 0;
};

// Node in BufferIntervalTree that stores the alloc and free times of a
// buffer, and the chunk assigned to it.
struct BufferIntervalTreeNode {
  // Alloc time.
  int64_t start;
  // Free time.
  int64_t end;
  // Maximum free time of all nodes in the subtree rooted at this node.
  int64_t subtree_end;
  // Allocated chunk for the buffer.
  HeapSimulator::Chunk chunk;
  // Left child.
  BufferIntervalTreeNode* left;
  // Right child.
  BufferIntervalTreeNode* right;
  // Parent.
  BufferIntervalTreeNode* parent;
};

// An interval tree that can query buffers overlapping in time.
class BufferIntervalTree {
 public:
  using Chunk = HeapSimulator::Chunk;
  // Adds a buffer to the interval tree, with the time interval and allocated
  // chunk specified.
  void Add(int64_t start, int64_t end, const Chunk& chunk);

  // Removes the interval from the tree. Returns true if the chunk was
  // removed.
  bool Remove(int64_t start, int64_t end, const Chunk& chunk);

  // Returns a vector of the allocated chunks that overlap with the given
  // time interval.
  std::vector<Chunk> ChunksOverlappingInTime(int64_t start, int64_t end) const;

  BufferIntervalTreeNode* GetRoot() { return root_; }

 private:
  BufferIntervalTreeNode* root_ = nullptr;
  std::list<BufferIntervalTreeNode> node_storage_;
};
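
// Example usage (illustrative): two chunks whose time intervals overlap are
// both returned by an overlap query.
//
//   BufferIntervalTree tree;
//   tree.Add(/*start=*/0, /*end=*/10, HeapSimulator::Chunk{0, 16});
//   tree.Add(/*start=*/5, /*end=*/15, HeapSimulator::Chunk{16, 16});
//   // Both intervals contain times 7 and 8, so both chunks are returned.
//   std::vector<HeapSimulator::Chunk> live =
//       tree.ChunksOverlappingInTime(/*start=*/7, /*end=*/8);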

// GlobalDecreasingSizeBestFitHeap collects the live intervals of all
// buffers, then allocates them in decreasing order of spatial or temporal
// size, regardless of alloc/free time. It internally tracks the allocated
// buffers and their live intervals; when allocating a buffer, it finds the
// best-fit free chunk during its live interval.
template <typename BufferType>
class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
 public:
  using HeapResult = HeapSimulator::HeapResult<BufferType>;
  using Result = HeapSimulator::Result<BufferType>;
  using Chunk = HeapSimulator::Chunk;

  enum Type {
    kSpatial = 0,
    kTemporal,
  };

  // BufferInterval stores a buffer's size and time interval.
  struct BufferInterval {
    const BufferType* buffer;
    int64_t size;
    // Alloc time of the buffer.
    int64_t start;
    // Free time of the buffer.
    int64_t end;

    // Buffers that need to be colocated with this one.
    absl::InlinedVector<const BufferType*, 2> colocations;

    // True if this buffer needs an allocation. False if it is colocated with
    // another buffer.
    bool need_allocation;
  };

  // Comparison function used to sort buffer intervals.
  using BufferIntervalCompare =
      std::function<bool(const BufferInterval&, const BufferInterval&)>;

  explicit GlobalDecreasingSizeBestFitHeap(int64_t alignment,
                                           Type type = kSpatial);
  ~GlobalDecreasingSizeBestFitHeap() override {}

  void Alloc(const BufferType* buffer, int64_t size) override;
  void Free(const BufferType* buffer, int64_t size) override;

  void ShareWith(const BufferType* buffer, const BufferType* share_with,
                 int64_t size) override;

  Result Finish() override;

  // Returns a BufferIntervalCompare function that sorts by spatial size. We
  // don't look at colocations, as they should have the same size.
  static BufferIntervalCompare GetSpatialBufferIntervalCompare();

 protected:
  // Returns the buffer intervals, sorted according to
  // buffer_interval_compare_.
  std::vector<BufferInterval> GetSortedBufferIntervals() const;

  // The two methods below are exposed to other heap algorithms that inherit
  // from this class. The Finish() method tries to find a candidate chunk for
  // each BufferInterval, after calling GetSortedBufferIntervals. If a
  // non-negative preferred_offset is provided, FindChunkCandidate attempts
  // to find a chunk at that offset. The Finish() method can then call
  // CommitChunk to associate the chunk with the BufferInterval, if the final
  // heap size is within the limits; see the sketch following this class.
  Chunk FindChunkCandidate(const BufferInterval& buffer_interval,
                           int64_t preferred_offset = -1) const;
  void CommitChunk(const BufferInterval& buffer_interval, Chunk chunk);

  // Adds the buffer and the chunk to the result chunk map.
  virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk);

  // Returns a BufferIntervalCompare function that sorts by live range. A
  // live range is defined as the range between the start of the first buffer
  // and the end of the last colocated buffer. There may be "holes" in the
  // live ranges of the colocated buffers, but this heuristic treats them as
  // contiguous.
  BufferIntervalCompare GetTemporalBufferIntervalCompare() const;

  absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
  HeapResult result_;
  BufferIntervalCompare buffer_interval_compare_;
  BufferIntervalTree interval_tree_;

 private:
  int64_t alignment_;

  // The current time, represented as an integer. It increments by 1 at each
  // Alloc or Free call.
  int64_t current_time_ = 0;

  // Returns all transitive colocated buffers of this buffer interval. I.e.,
  // if buffer A is colocated with B, and B is colocated with C, this
  // function returns all three of them.
  absl::flat_hash_set<const BufferType*> GetTransitiveColocations(
      const BufferInterval& interval) const;
};
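
// An illustrative sketch of how a subclass's Finish() can drive the
// protected helpers above (details such as limit checking are elided):
//
//   for (const BufferInterval& interval : GetSortedBufferIntervals()) {
//     if (!interval.need_allocation) continue;
//     Chunk chunk = FindChunkCandidate(interval);
//     // ... verify chunk against any heap-size limit before committing ...
//     CommitChunk(interval, chunk);
//   }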

// This class implements an algorithm that produces multiple heaps, where
// each heap's size is constrained by a given limit. Note that the constraint
// is soft, meaning that a valid heap result is generated even if some buffer
// sizes are larger than the given constraint size.
//
// Pseudocode:
//   while (`buffers` is not empty) {
//     create a new heap `h`
//     for (each buffer `buf` in `buffers` in size-decreasing order) {
//       if (buf.size() is larger than the heap size limit &&
//           `h` is empty) {
//         h.place(buf)
//         buffers.remove(buf)
//       } else if (placing `buf` into `h` does not violate the size
//                  constraint) {
//         h.place(buf)
//         buffers.remove(buf)
//       }
//     }
//   }
class ConstrainedGlobalDecreasingSizeBestFitHeap
    : public GlobalDecreasingSizeBestFitHeap<HloValue> {
 public:
  explicit ConstrainedGlobalDecreasingSizeBestFitHeap(
      uint64_t size_limit_per_heap, int64_t alignment, Type type = kSpatial)
      : GlobalDecreasingSizeBestFitHeap<HloValue>(alignment, type),
        size_limit_per_heap_(size_limit_per_heap) {}
  ~ConstrainedGlobalDecreasingSizeBestFitHeap() override {}

  Result Finish() override;

 private:
  uint64_t size_limit_per_heap_;
};

// A heap algorithm that chooses the best result from the other algorithms
// added to it.
template <typename BufferType>
class ChooseBestHeapAlgorithm : public HeapAlgorithm<BufferType> {
 public:
  using Result = HeapSimulator::Result<BufferType>;

  ChooseBestHeapAlgorithm(
      std::unique_ptr<std::vector<std::unique_ptr<HeapAlgorithm<BufferType>>>>
          algorithms)
      : algorithms_(std::move(*algorithms)) {}
  ~ChooseBestHeapAlgorithm() override {}

  void Alloc(const BufferType* buffer, int64_t size) override {
    for (auto& algorithm : algorithms_) {
      algorithm->Alloc(buffer, size);
    }
  }

  void ShareWith(const BufferType* buffer, const BufferType* share_with,
                 int64_t size) override {
    for (auto& algorithm : algorithms_) {
      algorithm->ShareWith(buffer, share_with, size);
    }
  }

  void Free(const BufferType* buffer, int64_t size) override {
    for (auto& algorithm : algorithms_) {
      algorithm->Free(buffer, size);
    }
  }

  Result Finish() override;

 private:
  std::vector<std::unique_ptr<HeapAlgorithm<BufferType>>> algorithms_;
};

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HEAP_SIMULATOR_H_