1 /**
2 * Copyright 2020-2023 Huawei Technologies Co., Ltd
3
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7
8 * http://www.apache.org/licenses/LICENSE-2.0
9
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <algorithm>
18 #include <chrono>
19 #include <cstdio>
20 #include <ctime>
21 #include <memory>
22 #include <string>
23 #include <vector>
24 #include <map>
25 #include "utils/hash_map.h"
26 #include "backend/common/somas/somas_solver_alg.h"
27 #include "backend/common/somas/somas_solver_core.h"
28 #include "backend/common/somas/somas_solver_pre.h"
29
30 using mindspore::HashMap;
31 using std::sort;
32 using std::vector;
33
34 namespace mindspore {
35 namespace somas {
36 constexpr auto kSolBytesThreshold = 100 * 1024 * 1024;
MemoryAllocationSolver()37 Status SomasSolverCore::MemoryAllocationSolver() {
38 Status retval = SUCCESS;
39 // print only for single heuristic no multi thread
40 if (!is_multi_thread_valid_) {
41 MS_LOG(INFO) << "Algorithm strategy: " << algorithmTypeNames[algorithm_];
42 MS_LOG(INFO) << "Sorting strategy: " << sortingNames[sort_strategy_];
43 MS_LOG(INFO) << "Offset strategy: " << branchingNames[branching_strategy_];
44 }
45 BuildBlocks();
46 SortTensors();
47 upperbound_ = FindSolutions();
48 Verify();
49 return retval;
50 }
51
Verify()52 Status SomasSolverCore::Verify() {
53 Status retval = SUCCESS;
54 if (verify_) {
55 MS_LOG(INFO) << "Verifying solution..";
56
57 if (!Verify(upperbound_)) {
58 MS_LOG(WARNING) << "Solver Allocation Memory Check FAILS";
59 retval = FAILED;
60 } else {
61 const double giga = 1024. * 1024. * 1024.;
62 MS_LOG(INFO) << "Solver Allocation Memory Check SUCCESS !!";
63 MS_LOG(INFO) << "Result: " << upperbound_ << " (" << (upperbound_) / (giga) << " GB)";
64 retval = SUCCESS;
65 }
66 }
67
68 return retval;
69 }
70
Verify(const size_t & upperbound)71 bool SomasSolverCore::Verify(const size_t &upperbound) {
72 auto start = std::chrono::system_clock::now();
73 bool retval = true;
74 size_t result = 0;
75 SomasSolverTensorDescPtr t1;
76 SomasSolverTensorDescPtr t2;
77
78 for (auto t1_ : tensors_) {
79 // check alignment
80 result = std::max(result, t1_.second->size_ + t1_.second->offset_);
81 for (auto t2_ : tensors_) {
82 t1 = t1_.second;
83 t2 = t2_.second;
84 if (t1->index_ == t2->index_) {
85 continue;
86 }
87 bool blifelong = (t1->lifelong_ || t2->lifelong_) && (t1->index_ != t2->index_);
88 if (t2->right_ == t1) { // continuous constraint
89 // t1 must be continuous to t2
90 bool bcontinuous = t1->offset_ == (t2->offset_ + t2->size_);
91 if (!bcontinuous) {
92 MS_LOG(WARNING) << "Continuous constraint violation in tensors " << t1->index_ << " and" << t2->index_;
93 retval = false;
94 }
95 } else if (blifelong || constraints_[t1->index_].IsBitTrue(t2->index_) == false) { // conflict constraint
96 size_t t1_ub = t1->offset_ + t1->size_;
97 size_t t2_ub = t2->offset_ + t2->size_;
98 bool b_overlap_lb = ((t2->offset_ >= t1->offset_) && (t2->offset_ < t1_ub));
99 bool b_overlap_ub = ((t2_ub > t1->offset_) && (t2_ub < t1_ub));
100 bool b_overlap = b_overlap_lb || b_overlap_ub;
101 bool biszerosized = t1->size_ == 0 || t2->size_ == 0;
102 if (b_overlap && !biszerosized) {
103 MS_LOG(WARNING) << "Non-overlap constraint violation in tensors " << t1->index_ << " and" << t2->index_;
104 retval = false;
105 }
106 }
107 }
108 }
109 if (upperbound != result) {
110 MS_LOG(WARNING) << "ERROR Invalid upperbound result --> Footprint Result: " << upperbound_
111 << " Tensor Result: " << result + lifelong_memory_;
112 retval = false;
113 }
114 MS_LOG(DEBUG)
115 << "\nElapsed time of Fast Heuristic Check: "
116 << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count() << " ms";
117 return retval;
118 }
119
BuildBlocks()120 void SomasSolverCore::BuildBlocks() {
121 MS_LOG(DEBUG) << "Building block of tensors";
122
123 lifelong_memory_ = 0;
124 uint64_t tensors_block_count = 0;
125 for (auto tensor : tensors_) {
126 SomasSolverTensorDescPtr pTensor = tensor.second;
127 MS_EXCEPTION_IF_NULL(pTensor);
128 if (pTensor->blocked_) {
129 continue;
130 }
131 if (pTensor->lifelong_) {
132 lifelong_memory_ += pTensor->size_;
133 continue;
134 }
135 // move to the left
136 while (pTensor->left_) {
137 pTensor = pTensor->left_;
138 }
139
140 // set start tensor
141 BlockTensor bTensor;
142 bTensor.m_bre_allocate_ = true;
143 bTensor.m_start_tensor_ = pTensor;
144 // find size
145 bTensor.m_size_ = 0;
146
147 do {
148 bTensor.m_size_ += pTensor->size_;
149 pTensor->blocked_ = true;
150 pTensor = pTensor->right_;
151 tensors_block_count++;
152 } while (pTensor != nullptr);
153
154 // add to the list
155 this->block_tensors_.emplace_back(bTensor);
156 }
157
158 if (tensors_block_count != tensors_.size()) {
159 MS_LOG(INFO) << static_cast<int>(tensors_.size() - tensors_block_count) << " lifelong tensors found";
160 }
161 }
162
Clean()163 void SomasSolverCore::Clean() {
164 for (auto &block : block_tensors_) {
165 block.m_bre_allocate_ = true;
166 auto pTensor = block.m_start_tensor_;
167 while (pTensor) {
168 pTensor->offset_ = 0;
169 pTensor = pTensor->right_;
170 }
171 }
172 upperbound_ = SIZE_MAX;
173 }
174
GreaterSizeSmallerIndex(const BlockTensor & t1,const BlockTensor & t2)175 static bool GreaterSizeSmallerIndex(const BlockTensor &t1, const BlockTensor &t2) {
176 if (t1.m_start_tensor_->is_graph_output_ != t2.m_start_tensor_->is_graph_output_) {
177 return t1.m_start_tensor_->is_graph_output_;
178 }
179 return t1.m_size_ > t2.m_size_ ||
180 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->index_ < t2.m_start_tensor_->index_);
181 }
SmallerReusePeakMemGreaterSizeSmallerIndex(const BlockTensor & t1,const BlockTensor & t2)182 static bool SmallerReusePeakMemGreaterSizeSmallerIndex(const BlockTensor &t1, const BlockTensor &t2) {
183 if (t1.m_start_tensor_->is_graph_output_ != t2.m_start_tensor_->is_graph_output_) {
184 return t1.m_start_tensor_->is_graph_output_;
185 }
186 if (t1.m_start_tensor_->can_reuse_peak_mem_ != t2.m_start_tensor_->can_reuse_peak_mem_) {
187 return t1.m_start_tensor_->can_reuse_peak_mem_ < t2.m_start_tensor_->can_reuse_peak_mem_;
188 }
189 return t1.m_size_ > t2.m_size_ ||
190 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->index_ < t2.m_start_tensor_->index_);
191 }
192 #ifdef SOMAS_DEBUG
GreaterSizeGreaterIndex(const BlockTensor & t1,const BlockTensor & t2)193 static bool GreaterSizeGreaterIndex(const BlockTensor &t1, const BlockTensor &t2) {
194 return t1.m_size_ > t2.m_size_ ||
195 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->index_ > t2.m_start_tensor_->index_);
196 }
GreaterSizeSmallerConstraintsSmallerIndex(const BlockTensor & t1,const BlockTensor & t2)197 static bool GreaterSizeSmallerConstraintsSmallerIndex(const BlockTensor &t1, const BlockTensor &t2) {
198 return t1.m_size_ > t2.m_size_ ||
199 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->constraints_ < t2.m_start_tensor_->constraints_) ||
200 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->constraints_ == t2.m_start_tensor_->constraints_ &&
201 t1.m_start_tensor_->index_ < t2.m_start_tensor_->index_);
202 }
GreaterSizeSmallerConstraintsGreaterIndex(const BlockTensor & t1,const BlockTensor & t2)203 static bool GreaterSizeSmallerConstraintsGreaterIndex(const BlockTensor &t1, const BlockTensor &t2) {
204 return t1.m_size_ > t2.m_size_ ||
205 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->constraints_ < t2.m_start_tensor_->constraints_) ||
206 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->constraints_ == t2.m_start_tensor_->constraints_ &&
207 t1.m_start_tensor_->index_ > t2.m_start_tensor_->index_);
208 }
GreaterSizeGreaterConstraintsSmallerIndex(const BlockTensor & t1,const BlockTensor & t2)209 static bool GreaterSizeGreaterConstraintsSmallerIndex(const BlockTensor &t1, const BlockTensor &t2) {
210 return t1.m_size_ > t2.m_size_ ||
211 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->constraints_ > t2.m_start_tensor_->constraints_) ||
212 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->constraints_ == t2.m_start_tensor_->constraints_ &&
213 t1.m_start_tensor_->index_ < t2.m_start_tensor_->index_);
214 }
GreaterSizeGreaterConstraintsGreaterIndex(const BlockTensor & t1,const BlockTensor & t2)215 static bool GreaterSizeGreaterConstraintsGreaterIndex(const BlockTensor &t1, const BlockTensor &t2) {
216 return t1.m_size_ > t2.m_size_ ||
217 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->constraints_ > t2.m_start_tensor_->constraints_) ||
218 (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->constraints_ == t2.m_start_tensor_->constraints_ &&
219 t1.m_start_tensor_->index_ > t2.m_start_tensor_->index_);
220 }
221 #endif
222
SortTensors()223 void SomasSolverCore::SortTensors() { // need to sort the tensors for Fast Heuristic
224 MS_LOG(DEBUG) << "Sorting Blocks of tensor, strategy: " << sortingNames[sort_strategy_];
225 typedef bool (*SortingFunction)(const BlockTensor &, const BlockTensor &);
226 mindspore::HashMap<SortingType, SortingFunction> sort_map;
227 sort_map[kGreaterSizeSmallerIndex] = &GreaterSizeSmallerIndex;
228 sort_map[kSmallerReusePeakMemGreaterSizeSmallerIndex] = &SmallerReusePeakMemGreaterSizeSmallerIndex;
229 #ifdef SOMAS_DEBUG
230 sort_map[kGreaterSizeGreaterIndex] = &GreaterSizeGreaterIndex;
231 sort_map[kGreaterSizeSmallerConstraintsSmallerIndex] = &GreaterSizeSmallerConstraintsSmallerIndex;
232 sort_map[kGreaterSizeSmallerConstraintsGreaterIndex] = &GreaterSizeSmallerConstraintsGreaterIndex;
233 sort_map[kGreaterSizeGreaterConstraintsSmallerIndex] = &GreaterSizeGreaterConstraintsSmallerIndex;
234 sort_map[kGreaterSizeGreaterConstraintsGreaterIndex] = &GreaterSizeGreaterConstraintsGreaterIndex;
235 #endif
236 if (sort_strategy_ < kNumSortingTypes) {
237 sort(block_tensors_.begin(), block_tensors_.end(), *(sort_map[sort_strategy_]));
238 }
239 }
240
Search(const std::shared_ptr<FootPrint> & pFootprint)241 size_t SomasSolverCore::Search(const std::shared_ptr<FootPrint> &pFootprint) {
242 size_t result = 0;
243 FastHeuristic fh;
244 MS_LOG(INFO) << "Calling FastSolver Search for " << block_tensors_.size() << " tensors ";
245 auto start = std::chrono::system_clock::now();
246 if (fh.Eval(&block_tensors_, pFootprint, &constraints_)) {
247 result = pFootprint->Result();
248 auto end = std::chrono::system_clock::now();
249 timing_ = std::chrono::duration_cast<std::chrono::milliseconds>((end - start)).count();
250 // print for serial all_ or multi thread solver
251 if (is_multi_thread_valid_) {
252 const double giga = 1073741824.;
253 MS_LOG(INFO) << timing_ << " ms\t" << sol_count_ + 1 << "/"
254 << static_cast<size_t>(kNumFittingTypes) * static_cast<size_t>(kNumAlgorithmTypes) *
255 static_cast<size_t>(kNumSortingTypes)
256 << "\t" << result << " Bytes (" << result / giga << " GB)\t" << algorithmTypeNames[algorithm_]
257 << "\t" << sortingNames[sort_strategy_] << "\t" << branchingNames[branching_strategy_];
258 }
259 } else {
260 MS_LOG(INFO) << "FastSolver could not find solution";
261 }
262
263 if (result < upperbound_) {
264 upperbound_ = result;
265 best_sol_ = pFootprint->m_solId_;
266 }
267
268 return upperbound_;
269 }
270
AppendLifelongTensors()271 void SomasSolverCore::AppendLifelongTensors() {
272 MS_LOG(DEBUG) << "Appending lifelong tensors to solution";
273 size_t offset = upperbound_;
274 std::map<size_t, SomasSolverTensorDescPtr> lifelongTensors;
275 for (const auto &t : tensors_) {
276 if (t.second->lifelong_) {
277 (void)lifelongTensors.emplace(t.first, t.second);
278 }
279 }
280 for (const auto &t : lifelongTensors) {
281 auto &pTensor = t.second;
282 pTensor->offset_ = offset;
283 offset += pTensor->size_;
284 }
285 upperbound_ += lifelong_memory_;
286 MS_LOG(DEBUG) << lifelong_memory_ << " bytes from lifelong tensors added to solution";
287 }
288
FindSolutions()289 size_t SomasSolverCore::FindSolutions() {
290 MS_LOG(DEBUG) << "Start allocating blocks,offset strategy: " << branchingNames[branching_strategy_];
291
292 std::shared_ptr<FootPrint> pFootprint = std::make_shared<FootPrint>();
293 pFootprint->setBranchingStrategy(static_cast<uint32_t>(branching_strategy_));
294 pFootprint->setCurrentSol(sol_count_);
295 pFootprint->setAlgorithm(static_cast<uint32_t>(algorithm_));
296 Search(pFootprint);
297 AppendLifelongTensors();
298 Destroy(&pFootprint);
299 return upperbound_;
300 }
301
Destroy(std::shared_ptr<FootPrint> * pFootprint) const302 void SomasSolverCore::Destroy(std::shared_ptr<FootPrint> *pFootprint) const {
303 while ((*pFootprint) != nullptr) {
304 if ((*pFootprint)->Next() != nullptr) {
305 std::shared_ptr<FootPrint> &p = (*pFootprint);
306 (*pFootprint) = (*pFootprint)->Next();
307 p = nullptr;
308 } else {
309 (*pFootprint) = nullptr;
310 }
311 }
312 }
313 } // namespace somas
314 } // namespace mindspore
315