/**
 * Copyright 2019-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_
#define MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_

#include <memory>
#include <vector>
#include <map>
#include <mutex>
#include <condition_variable>
#include <tuple>
#include <string>
#include <utility>
#include <deque>
#include <algorithm>
#include "include/backend/debug/tensor_data.h"
#ifndef OFFLINE_DBG_MODE
#include "include/backend/debug/data_dump/dump_json_parser.h"
#endif
namespace mindspore {
class TensorLoader {
 public:
  TensorLoader() : mem_total_(0), mem_usage_(0) {}

  ~TensorLoader() { EmptyTensor(); }

  void MoveTensorCurrentToPrev(const std::string &tensor_name) {
    auto handle = tensor_list_map_.extract(tensor_name);
    if (!handle.empty()) {
      MS_LOG(INFO) << "Moving " << tensor_name << " from current map to previous map";
      prev_tensor_list_map_.insert(std::move(handle));
    }
  }

  void SwapCurrentPrev() { tensor_list_map_.swap(prev_tensor_list_map_); }

  bool TensorExistsInCurrent(const std::string &tensor_name) const {
    return tensor_list_map_.find(tensor_name) != tensor_list_map_.end();
  }

  // Only parameters keep a ":prev" copy, so this returns true only for parameters.
  bool PrevTensorExistsInCurrent(const std::string &tensor_name) const {
    return TensorExistsInCurrent(tensor_name + ":prev");
  }

  void MoveParametersCurrentToPrev() {
    MS_LOG(INFO) << "Moving parameters from current map to previous map";
    auto iter = tensor_list_map_.begin();
    while (iter != tensor_list_map_.end()) {
      auto key = iter->first;
      if (PrevTensorExistsInCurrent(key)) {
        // A ":prev" copy exists only for parameters, so move this parameter to the previous map.
        // Advance the iterator before the node is extracted so it stays valid.
        ++iter;
        MoveTensorCurrentToPrev(key);
      } else {
        ++iter;
      }
    }
  }

  bool IsPrevTensor(std::string tensor_name) const {
    const std::string suffix = ":prev";
    if (tensor_name.length() <= suffix.length()) {
      return false;
    }
    return std::equal(suffix.rbegin(), suffix.rend(), tensor_name.rbegin());
  }

  /*
   * Feature group: Dump, Online debugger and Offline debugger.
   * Target device group: Ascend, GPU.
   * Runtime category: Old runtime, MindRT.
   * Description: Load new tensor into tensor_list_map_ (debugger backend cache). In offline debugger, add ":prev" to
   * the previous tensor's name to avoid segfault caused by wrongly evicting the tensor when memory limit is enabled.
   */
  bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
    std::lock_guard<std::mutex> lg(lock_);
    auto tensor_name = tensor->GetName();
    if (keep_prev) {
      // add prev step tensor into current step map with ":prev" suffix
      auto handle = prev_tensor_list_map_.extract(tensor_name);
      if (!handle.empty()) {
        handle.key() = tensor_name + ":prev";
        tensor_list_map_.insert(std::move(handle));
      }
    }
    std::string key_name = tensor_name;
#ifdef OFFLINE_DBG_MODE
    std::string output_type = tensor->GetIsOutput() ? "1" : "0";
    key_name += (":" + std::to_string(tensor->GetDeviceId()) + ":" + std::to_string(tensor->GetRootGraphId()) + ":" +
                 output_type + ":" + std::to_string(tensor->GetSlot()));
    if (tensor_list_map_.find(key_name) != tensor_list_map_.end() &&
        tensor->GetIteration() == tensor_list_map_[key_name]->GetPrevIteration()) {
      key_name += ":prev";
    }
#endif
    tensor_list_map_[key_name] = tensor;  // use [] instead of insert to ensure latest value
    return true;
  }
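
  /*
   * Illustrative usage (a minimal sketch, not part of this class): a debugger service that has
   * already filled a TensorData object could cache it and read it back roughly as follows. The
   * variable names below are hypothetical.
   *
   *   std::shared_ptr<TensorData> td = ...;               // populated by the caller
   *   TensorLoader loader;
   *   loader.LoadNewTensor(td, true);                     // keeps a ":prev" copy if one exists
   *   auto cached = loader.GetTensor(td->GetName());      // nullptr if it was never cached
   */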

  std::vector<std::shared_ptr<TensorData>> GetTensor() {
    std::vector<std::shared_ptr<TensorData>> tensor_list;
    for (auto it = tensor_list_map_.cbegin(); it != tensor_list_map_.cend(); ++it) {
      if (!IsPrevTensor(it->first)) {
        tensor_list.push_back(it->second);
      }
    }
    return tensor_list;
  }

  std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) const {
    auto iter = tensor_list_map_.find(tensor_name);
    if (iter != tensor_list_map_.end()) {
      return iter->second;
    }
    return nullptr;
  }

  std::shared_ptr<TensorData> GetPrevTensor(const std::string &tensor_name) {
    if (tensor_list_map_.find(tensor_name + ":prev") != tensor_list_map_.end()) {
      return tensor_list_map_[tensor_name + ":prev"];
    }
    return nullptr;
  }

  /*
   * Feature group: Online debugger.
   * Target device group: Ascend, GPU.
   * Runtime category: Old runtime, MindRT.
   * Description: Search and obtain TensorData for a list of tensors from tensor_list_map_ (debugger backend cache).
   * For any tensor that is not found, a nullptr is stored in result_list.
   */
  void SearchTensors(const std::vector<std::string> &search_list,
                     std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
    for (const auto &i : search_list) {
      std::map<std::string, std::shared_ptr<TensorData>>::const_iterator iter = tensor_list_map_.find(i);
      if (iter != tensor_list_map_.cend()) {
        result_list->push_back(std::make_tuple(i, iter->second));
      } else {
        result_list->push_back(std::make_tuple(i, nullptr));
      }
    }
  }
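
  /*
   * Illustrative usage (a minimal sketch with hypothetical tensor names): callers pass the cache
   * keys they are interested in and receive one (name, data) tuple per key, with nullptr for
   * tensors that are not cached. Assumes a TensorLoader instance named `loader`.
   *
   *   std::vector<std::string> names = {"Default/Conv2D-op1:0", "Default/ReLU-op2:0"};
   *   std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> found;
   *   loader.SearchTensors(names, &found);
   */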

  void EmptyTensor() noexcept {
    std::lock_guard<std::mutex> lg(lock_);
    prev_tensor_list_map_.clear();
    tensor_list_map_.swap(prev_tensor_list_map_);
  }

  void EmptyCurrentTensor() { tensor_list_map_.clear(); }

  bool EnableMemoryControl() const { return mem_total_ > 0; }

  /*
   * Feature group: Offline debugger.
   * Target device group: Ascend, GPU.
   * Runtime category: Old runtime, MindRT.
   * Description: This function is for the memory control feature only. When the offline debugger has finished using a
   * tensor, the tensor is added to cache_evict_queue_ and becomes an eviction candidate. Once there is not enough
   * memory to read in a new tensor, candidates are evicted from the cache.
   */
  void AppendToCacheEvictQueue(const std::string &tensor_name) {
    std::lock_guard<std::mutex> lk(mem_lock_);
    if (std::find(cache_evict_queue_.begin(), cache_evict_queue_.end(), tensor_name) == cache_evict_queue_.end()) {
      cache_evict_queue_.push_back(tensor_name);
      evict_cond.notify_one();
    }
  }

  /*
   * Feature group: Offline debugger.
   * Target device group: Ascend, GPU.
   * Runtime category: Old runtime, MindRT.
   * Description: This function is for the memory control feature only. Check whether the tensor fits within the preset
   * memory limit; if the cache is full, evict candidate tensors from cache_evict_queue_ to make room for it.
   */
  bool CheckMemoryAvailable(const std::string &backend_name, const uint64_t data_size) {
    // 1. Check if the tensor can fit within the entire limit. If not, don't attempt any read or eviction and log an
    // error.
    if (data_size > mem_total_) {
      MS_LOG(ERROR) << "Failed to load data of tensor " << backend_name << " because its data size (" << data_size
                    << ") exceeds the maximum memory limit (" << mem_total_ << ").";
      return false;
    }
    // 2. Check if there is enough cache space available for the current tensor. If not, try to evict from the cache.
    bool ret = CheckAndEvictTensorCache(data_size);
    return ret;
  }

  /*
   * Feature group: Offline debugger.
   * Target device group: Ascend, GPU.
   * Runtime category: Old runtime, MindRT.
   * Description: This function is for the memory control feature only. Greedily evict not-in-use tensors from the
   * cache queue. If there is no candidate in the queue, block the thread until a candidate becomes available.
   */
  bool CheckAndEvictTensorCache(const uint64_t data_size) {
    std::string candidate_name;
    uint64_t candidates_size;
    std::unique_lock<std::mutex> lk(mem_lock_);
    while (data_size > mem_total_ - mem_usage_) {
      // wait until there is any not-in-use candidate to be evicted from cache
      evict_cond.wait(lk, [this] { return !cache_evict_queue_.empty(); });
      candidate_name = cache_evict_queue_.front();
      cache_evict_queue_.pop_front();
      // evict candidate tensor
      auto tensor = GetTensor(candidate_name);
      if (tensor == nullptr) {
        MS_LOG(INFO) << "Tensor: " << candidate_name << " has already been evicted.";
        continue;
      }
      candidates_size = tensor->GetByteSize();
      tensor_list_map_.erase(candidate_name);
      // Guard against unsigned underflow when subtracting the evicted size.
      mem_usage_ = (mem_usage_ > candidates_size) ? (mem_usage_ - candidates_size) : 0;
      MS_LOG(INFO) << "Evict tensor: " << candidate_name;
    }
    // Reserve space for the current target tensor.
    mem_usage_ = std::min(mem_total_, mem_usage_ + data_size);
    return true;
  }

  void SetMemTotal(uint64_t total_mem_size) { this->mem_total_ = total_mem_size; }
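
  /*
   * Illustrative memory-control flow (a minimal sketch with hypothetical names and sizes): the
   * offline debugger sets the cache limit once, checks the limit before reading each tensor from
   * disk, and queues a tensor for eviction when it is no longer in use. Assumes a TensorLoader
   * instance named `loader`.
   *
   *   loader.SetMemTotal(4096);                                  // total cache budget, in the same units as tensor byte sizes
   *   if (loader.CheckMemoryAvailable("Default/Conv2D-op1:0", 1024)) {
   *     // read the tensor from disk and cache it with LoadNewTensor(...)
   *   }
   *   loader.AppendToCacheEvictQueue("Default/Conv2D-op1:0");    // now an eviction candidate
   */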

#ifndef OFFLINE_DBG_MODE
  /*
   * Feature group: Dump.
   * Target device group: GPU, Ascend.
   * Runtime category: Old runtime, MindRT.
   * Description: Load tensor data from debugger backend cache (tensor_list_map_) and dump to file in npy format,
   *              used for GPU and Ascend KernelByKernel mode.
   */
  bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) {
    if (filepath.empty()) {
      MS_LOG(ERROR) << "Dump file path is null!";
      return false;
    }

    std::string tensor_loader_name = tensor_name + ":" + std::to_string(slot);
    std::map<std::string, std::shared_ptr<TensorData>>::const_iterator iter = tensor_list_map_.find(tensor_loader_name);
    if (iter != tensor_list_map_.cend()) {
      std::shared_ptr<TensorData> node = iter->second;
      std::string path = filepath + '.' + node->GetFormat();
      if (node->GetByteSize() == 0) {
        MS_LOG(INFO) << "The byte size is 0 for tensor: " << tensor_loader_name;
        return false;
      }
      auto type_string = node->GetTypeString();
      if (type_string == "bfloat16") {
        std::shared_ptr<tensor::Tensor> bfloat16_tensor = std::make_shared<tensor::Tensor>(
          TypeId::kNumberTypeBFloat16, node->GetShape(), static_cast<void *>(const_cast<char *>(node->GetDataPtr())),
          node->GetByteSize());
        std::shared_ptr<tensor::Tensor> float32_tensor =
          std::make_shared<tensor::Tensor>(*bfloat16_tensor, TypeId::kNumberTypeFloat32);
        return DumpJsonParser::DumpToFile(path, float32_tensor->data_c(), float32_tensor->Size(),
                                          float32_tensor->shape_c(),
                                          static_cast<TypeId>(float32_tensor->data_type_c()));
      }
      return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), node->GetByteSize(), node->GetShape(),
                                        StringToTypeId(node->GetTypeString()));
    }
    MS_LOG(INFO) << "Tensor name:" << tensor_name << " not found in tensor_list_map_";
    return false;
  }
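
  /*
   * Illustrative usage (a minimal sketch with a hypothetical dump path): the tensor is looked up
   * under the key tensor_name + ":" + slot, the tensor's format string is appended to the path,
   * and the data is written in npy format through DumpJsonParser::DumpToFile.
   *
   *   loader.DumpTensorToFile("/tmp/dump/Default.Conv2D-op1.output.0", "Default/Conv2D-op1", 0);
   */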
#endif

 private:
  // Maps from cache key (tensor name, possibly with extra ":" suffixes) to the cached TensorData.
  std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map_;
  std::map<std::string, std::shared_ptr<TensorData>> prev_tensor_list_map_;
  std::mutex lock_;
  std::mutex mem_lock_;
  uint64_t mem_total_;
  uint64_t mem_usage_;
  std::deque<std::string> cache_evict_queue_;
  std::condition_variable evict_cond;
};
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_