• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2024 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "include/backend/mem_reuse/mem_tracker.h"
18 #include <fstream>
19 #include "frontend/parallel/group_manager.h"
20 #include "utils/ms_context.h"
21 #include "include/common/debug/common.h"
22 #include "include/common/utils/comm_manager.h"
23 #include "include/backend/device_type.h"
24 #include "include/backend/mem_reuse/mem_dynamic_allocator.h"
25 #include "include/common/utils/utils.h"
26 #include "include/backend/distributed/collective/collective_manager.h"
27 #include "utils/file_utils.h"
28 
29 namespace mindspore {
30 namespace device {
31 namespace tracker {
// Sentinel start time stamp: DumpProfilingMemInfo skips memory blocks whose start_time_stamp equals this value.
constexpr int64_t kIllegalStartTimeStamp{-1};
33 namespace {
GetRankID()34 std::string GetRankID() {
35   uint32_t rank_id = 0;
36 #if !defined(BUILD_LITE)
37   if (distributed::collective::CollectiveManager::instance()->initialized()) {
38     rank_id = CommManager::GetInstance().GetRank();
39   }
40 #endif
41   return std::to_string(rank_id);
42 }
43 
GetAllocatorType(MemType mem_type)44 AllocatorType GetAllocatorType(MemType mem_type) {
45   static std::map<MemType, device::AllocatorType> mem_allocator_type_map = {
46     {MemType::kWeight, AllocatorType::kWeight},
47     {MemType::kConstantValue, AllocatorType::kConstantValue},
48     {MemType::kKernel, AllocatorType::kConstantValue},
49     {MemType::kGraphOutput, AllocatorType::kGraphOutput},
50     {MemType::kSomas, AllocatorType::kConstantValue},
51     {MemType::kInSideSomas, AllocatorType::kConstantValue},
52     {MemType::kSomasOutput, AllocatorType::kKernelOutput},
53     {MemType::kGeConst, AllocatorType::kConstantValue},
54     {MemType::kBatchMemory, AllocatorType::kConstantValue},
55     {MemType::kContinuousMemory, AllocatorType::kConstantValue},
56     {MemType::kPyNativeInput, AllocatorType::kConstantValue},
57     {MemType::kPyNativeOutput, AllocatorType::kKernelOutput},
58     {MemType::kGeFeatureMemory, AllocatorType::kConstantValue},
59     {MemType::kWorkSpace, AllocatorType::kWorkspace},
60     {MemType::kOther, AllocatorType::kOther}};
61 
62   auto iter = mem_allocator_type_map.find(mem_type);
63   if (iter == mem_allocator_type_map.end()) {
64     MS_LOG(WARNING) << "Not found mem_type:" << mem_type << " in mem_allocator_type_map.";
65     return AllocatorType::kOther;
66   }
67   return iter->second;
68 }
69 }  // namespace
70 
GetPath()71 std::pair<std::string, std::string> MemoryTrackerEnabled::GetPath() {
72   std::string block_csv_path;
73   std::string task_csv_path;
74 
75   auto ms_context = MsContext::GetInstance();
76   auto trace_path = ms_context->get_param<std::string>(MS_CTX_PROF_MEM_OUTPUT_PATH);
77   if (trace_path.empty()) {
78     trace_path = "./";
79   }
80 
81   if (enable_hccl_) {
82     block_csv_path = trace_path + "/rank_" + GetRankID() + "/memory_block.csv";
83     task_csv_path = trace_path + "/rank_" + GetRankID() + "/task.csv";
84   } else {
85     block_csv_path = trace_path + "/memory_block.csv";
86     task_csv_path = trace_path + "/task.csv";
87   }
88   return std::make_pair(block_csv_path, task_csv_path);
89 }
90 
AddTask(const std::string & task_name,const std::string & node_name,const std::string & graph_name,const std::string & file_name,size_t line_num)91 void MemoryTrackerEnabled::AddTask(const std::string &task_name, const std::string &node_name,
92                                    const std::string &graph_name, const std::string &file_name, size_t line_num) {
93   std::string python_stack;
94   if (WithPythonStack()) {
95     python_stack = GetPythonStackStr();
96   }
97 
98   std::lock_guard lock(mutex_);
99   if (!is_init_enable_hccl_) {
100     // MS_CTX_ENABLE_HCCL will be reset when the process is destroyed.
101     // Therefore, record the enable_hccl when AddTask for the first time.
102     auto ms_context = MsContext::GetInstance();
103     enable_hccl_ = ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL);
104     is_init_enable_hccl_ = true;
105   }
106 
107   time_stamp_++;
108   auto task_info = std::make_shared<TaskInfo>();
109   MS_EXCEPTION_IF_NULL(task_info);
110   task_info->task_name = task_name;
111   task_info->node_name = node_name;
112   task_info->graph_name = graph_name;
113   task_info->file_name = file_name;
114   task_info->line_num = line_num;
115   task_info->time_stamp = time_stamp_;
116   task_info->python_stack = python_stack;
117   task_map_[task_name] = task_info;
118   task_list_.push_back(task_info);
119 }
120 
NewMemInfo(const std::string & task_name,MemType type,size_t size,KernelTensorPtr kernel_tensor,const std::string & file_name,size_t line_num)121 MemInfoPtr MemoryTrackerEnabled::NewMemInfo(const std::string &task_name, MemType type, size_t size,
122                                             KernelTensorPtr kernel_tensor, const std::string &file_name,
123                                             size_t line_num) {
124   auto mem_info = std::make_shared<MemInfo>();
125   MS_EXCEPTION_IF_NULL(mem_info);
126   mem_info->type = type;
127   mem_info->size = size;
128   mem_info->kernel_tensor = kernel_tensor;
129   mem_info->file_name = file_name;
130   mem_info->line_num = line_num;
131   auto iter = task_map_.find(task_name);
132   if (iter == task_map_.end()) {
133     MS_LOG(ERROR) << "MemoryTracker AddMemInfo failed, task_name:" << task_name << " not found, " << file_name << ":"
134                   << line_num;
135     return nullptr;
136   }
137 
138   const auto &node_name = iter->second->node_name;
139   DynamicMemAllocatorDebugInfo::SetDebugInfo(node_name, GetAllocatorType(type));
140 
141   mem_info->producer_task = iter->second;
142   mem_info_list_.push_back(mem_info);
143   return mem_info;
144 }
145 
AddMemInfoForKernelTensor(const std::string & task_name,MemType type,size_t size,KernelTensorPtr kernel_tensor,const std::string & file_name,size_t line_num)146 void MemoryTrackerEnabled::AddMemInfoForKernelTensor(const std::string &task_name, MemType type, size_t size,
147                                                      KernelTensorPtr kernel_tensor, const std::string &file_name,
148                                                      size_t line_num) {
149   auto mem_info = NewMemInfo(task_name, type, size, kernel_tensor, file_name, line_num);
150   if (mem_info != nullptr) {
151     kernel_tensor_mem_map[kernel_tensor] = mem_info;
152   }
153 }
154 
AddMemInfo(const std::string & task_name,MemType type,size_t size,DeviceAddress * device_address,const std::string & file_name,size_t line_num)155 void MemoryTrackerEnabled::AddMemInfo(const std::string &task_name, MemType type, size_t size,
156                                       DeviceAddress *device_address, const std::string &file_name, size_t line_num) {
157   MS_EXCEPTION_IF_NULL(device_address);
158   if (device_address->GetDeviceType() == DeviceType::kCPU) {
159     return;
160   }
161   std::lock_guard<std::mutex> lock(mutex_);
162 
163   if (device_address->kernel_tensor() == nullptr) {
164     auto mem_info = NewMemInfo(task_name, type, size, nullptr, file_name, line_num);
165     device_address_mem_map[device_address] = mem_info;
166   } else {
167     AddMemInfoForKernelTensor(task_name, type, size, device_address->kernel_tensor().get(), file_name, line_num);
168   }
169 }
170 
UpdateMemInfo(const DeviceAddress * device_address,MemType mem_type,const std::string & file_name,size_t line_num)171 void MemoryTrackerEnabled::UpdateMemInfo(const DeviceAddress *device_address, MemType mem_type,
172                                          const std::string &file_name, size_t line_num) {
173   std::lock_guard lock(mutex_);
174   if (device_address->GetDeviceType() == DeviceType::kCPU) {
175     return;
176   }
177   auto kernel_tensor = device_address->kernel_tensor().get();
178   auto iter = kernel_tensor_mem_map.find(kernel_tensor);
179   if (iter == kernel_tensor_mem_map.end()) {
180     MS_LOG(ERROR) << "MemoryTracker UpdateMemInfoMemType failed, kernel_tensor:" << kernel_tensor << " not found";
181     return;
182   }
183   iter->second->type = mem_type;
184   iter->second->file_name = file_name;
185   iter->second->line_num = line_num;
186 }
187 
AddCompileTimeMemInfo(const std::string & task_name,size_t size,DeviceMemPtr device_ptr,MemType mem_type,const std::string & file_name,size_t line_num)188 void MemoryTrackerEnabled::AddCompileTimeMemInfo(const std::string &task_name, size_t size, DeviceMemPtr device_ptr,
189                                                  MemType mem_type, const std::string &file_name, size_t line_num) {
190   std::lock_guard lock(mutex_);
191   auto mem_info = std::make_shared<MemInfo>();
192   MS_EXCEPTION_IF_NULL(mem_info);
193   mem_info->type = mem_type;
194   mem_info->size = size;
195   mem_info->file_name = file_name;
196   mem_info->line_num = line_num;
197   auto iter = task_map_.find(task_name);
198   if (iter == task_map_.end()) {
199     MS_LOG(ERROR) << "MemoryTracker AddCompileTimeMemInfo failed, task_name:" << task_name << " not found, "
200                   << file_name << ":" << line_num;
201     return;
202   }
203   mem_info->producer_task = iter->second;
204   auto mem_block_iter = device_mem_block_map.find(device_ptr);
205   if (mem_block_iter == device_mem_block_map.end()) {
206     MS_LOG(ERROR) << "MemoryTracker AddCompileTimeMemInfo failed, device_ptr:" << device_ptr << " not found, "
207                   << file_name << ":" << line_num;
208     return;
209   }
210   mem_info->mem_block = mem_block_iter->second;
211   mem_info->mem_block->is_bind = true;
212   mem_info->mem_block->mem_info = mem_info;
213   mem_info_list_.push_back(mem_info);
214 }
215 
BindDevicePtr(DeviceAddress * device_address,DeviceMemPtr device_ptr,const std::string & file_name,size_t line_num)216 void MemoryTrackerEnabled::BindDevicePtr(DeviceAddress *device_address, DeviceMemPtr device_ptr,
217                                          const std::string &file_name, size_t line_num) {
218   if (device_address == nullptr) {
219     return;
220   }
221   std::lock_guard<std::mutex> lock(mutex_);
222   if (device_address->GetDeviceType() == DeviceType::kCPU) {
223     return;
224   }
225   MemInfoPtr mem_info{nullptr};
226   if (device_address->kernel_tensor() == nullptr) {
227     auto iter = device_address_mem_map.find(device_address);
228     if (iter == device_address_mem_map.end()) {
229       MS_LOG(ERROR) << "MemoryTracker BindDevicePtr failed, device_address:" << device_address << " not found, "
230                     << file_name << ":" << line_num;
231       return;
232     }
233     mem_info = iter->second;
234   } else {
235     auto iter = kernel_tensor_mem_map.find(device_address->kernel_tensor().get());
236     if (iter == kernel_tensor_mem_map.end()) {
237       MS_LOG(ERROR) << "MemoryTracker BindDevicePtr failed, kernel_tensor:" << device_address->kernel_tensor().get()
238                     << " not found, " << file_name << ":" << line_num;
239       return;
240     }
241     mem_info = iter->second;
242   }
243 
244   if (mem_info->type == MemType::kInSideSomas) {
245     auto mem_block_info = std::make_shared<MemBlockInfo>();
246     MS_EXCEPTION_IF_NULL(mem_block_info);
247     mem_block_info->device_addr = device_ptr;
248     mem_block_info->size = mem_info->size;
249     mem_block_info->start_time_stamp = -1;
250     mem_block_info->end_time_stamp = -1;
251     mem_block_info->is_bind = true;
252     mem_block_info->mem_info = mem_info;
253     mem_info->mem_block = mem_block_info;
254     device_mem_block_map[device_ptr] = mem_block_info;
255     mem_block_list_.push_back(mem_block_info);
256     // mem_block need to dump again, after mem_block_list_ changed
257     has_dump = false;
258     return;
259   }
260   auto mem_block_iter = device_mem_block_map.find(device_ptr);
261   if (mem_block_iter == device_mem_block_map.end()) {
262     MS_LOG(ERROR) << "MemoryTracker BindDevicePtr failed, device_ptr:" << device_ptr << " not found, " << file_name
263                   << ":" << line_num;
264     return;
265   }
266   mem_info->mem_block = mem_block_iter->second;
267   mem_info->mem_block->is_bind = true;
268   mem_info->mem_block->mem_info = mem_info;
269 }
270 
UpdateDevicePtrInfo(DeviceMemPtr device_ptr,MemType mem_type,const std::string & task_name,const std::string & file_name,size_t line_num)271 void MemoryTrackerEnabled::UpdateDevicePtrInfo(DeviceMemPtr device_ptr, MemType mem_type, const std::string &task_name,
272                                                const std::string &file_name, size_t line_num) {
273   std::lock_guard lock(mutex_);
274   auto mem_block_iter = device_mem_block_map.find(device_ptr);
275   if (mem_block_iter == device_mem_block_map.end()) {
276     MS_LOG(ERROR) << "MemoryTracker AddCompileTimeMemInfo failed, device_ptr:" << device_ptr << " not found, "
277                   << file_name << ":" << line_num;
278     return;
279   }
280   auto mem_info = std::make_shared<MemInfo>();
281   MS_EXCEPTION_IF_NULL(mem_info);
282   auto task_info = std::make_shared<TaskInfo>();
283   MS_EXCEPTION_IF_NULL(task_info);
284   task_info->task_name = task_name;
285   mem_info->producer_task = task_info;
286   mem_info->file_name = file_name;
287   mem_info->line_num = line_num;
288   mem_info->type = mem_type;
289   mem_info->mem_block = mem_block_iter->second;
290   mem_info->mem_block->is_bind = true;
291   mem_info->mem_block->mem_info = mem_info;
292   mem_info_list_.push_back(mem_info);
293 }
294 
AllocMemBlock(DeviceMemPtr device_addr,size_t size,const std::string & pool_name,size_t actual_peak_memory,size_t in_used_size,size_t total_size,uint32_t stream_id)295 void MemoryTrackerEnabled::AllocMemBlock(DeviceMemPtr device_addr, size_t size, const std::string &pool_name,
296                                          size_t actual_peak_memory, size_t in_used_size, size_t total_size,
297                                          uint32_t stream_id) {
298   std::lock_guard lock(mutex_);
299   time_stamp_++;
300   auto mem_block = std::make_shared<MemBlockInfo>();
301   MS_EXCEPTION_IF_NULL(mem_block);
302   mem_block->device_addr = device_addr;
303   mem_block->start_time_stamp = time_stamp_;
304   mem_block->actual_peak_memory = actual_peak_memory;
305   mem_block->size = size;
306   mem_block->pool_name = pool_name;
307   mem_block->stream_id = stream_id;
308   mem_block->real_start_time = GetCurrentUSec();
309   mem_block->alloc_in_used_size = in_used_size;
310   mem_block->alloc_total_size = total_size;
311   device_mem_block_map[device_addr] = mem_block;
312   real_device_mem_block_map[device_addr] = mem_block;
313   mem_block_list_.emplace_back(mem_block);
314   // mem_block need to dump again, after mem_block_list_ changed
315   has_dump = false;
316 }
317 
FreeMemBlock(DeviceMemPtr device_addr,size_t in_used_size,size_t total_size)318 void MemoryTrackerEnabled::FreeMemBlock(DeviceMemPtr device_addr, size_t in_used_size, size_t total_size) {
319   std::lock_guard lock(mutex_);
320   time_stamp_++;
321   auto iter = real_device_mem_block_map.find(device_addr);
322   if (iter == real_device_mem_block_map.end()) {
323     MS_LOG(ERROR) << "MemoryTracker FreeMemBlock failed, device_addr:" << device_addr << " not found";
324     return;
325   }
326   iter->second->end_time_stamp = time_stamp_;
327   iter->second->real_end_time = GetCurrentUSec();
328   iter->second->release_in_used_size = in_used_size;
329   iter->second->release_total_size = total_size;
330 }
331 
UseMemBlock(const std::string & task_name,DeviceMemPtr device_addr,const std::string & file_name,size_t line_num)332 void MemoryTrackerEnabled::UseMemBlock(const std::string &task_name, DeviceMemPtr device_addr,
333                                        const std::string &file_name, size_t line_num) {
334   std::lock_guard lock(mutex_);
335   auto iter = device_mem_block_map.find(device_addr);
336   if (iter == device_mem_block_map.end()) {
337     MS_LOG(ERROR) << "MemoryTracker UseMemBlock failed, device_addr:" << device_addr << " not found, " << file_name
338                   << ":" << line_num;
339     return;
340   }
341   if (iter->second->pool_name == "CPU") {
342     return;
343   }
344   auto task_iter = task_map_.find(task_name);
345   if (task_iter == task_map_.end()) {
346     MS_LOG(ERROR) << "MemoryTracker UseMemBlock failed, task_name:" << task_name << " not found, " << file_name << ":"
347                   << line_num;
348     return;
349   }
350   auto mem_info = iter->second->mem_info.lock();
351   if (mem_info == nullptr) {
352     MS_LOG(ERROR) << "MemoryTracker UseMemBlock failed, mem_info is null, " << file_name << ":" << line_num;
353     return;
354   }
355   mem_info->user_tasks.push_back(task_iter->second);
356 }
357 
358 namespace {
// Divisors used to print byte counts as KB / MB, and the stream precision applied to the profiling CSV.
constexpr size_t kKBToByte{1024};
constexpr size_t kMBToKB{1024};
constexpr int kPrecisionDigits{20};
362 
__anona88153740302(const std::vector<TaskInfoPtr> &task_list) 363 auto task_list_to_str = [](const std::vector<TaskInfoPtr> &task_list) -> std::string {
364   std::stringstream ss;
365   ss << "{";
366   for (auto &task : task_list) {
367     ss << task->time_stamp << "-";
368   }
369   ss << "}";
370   return ss.str();
371 };
372 
373 const std::vector<std::pair<std::string, std::function<void(const MemBlockInfoPtr &, std::ofstream &)>>> block_csv = {
374   {"start_time_stamp",
__anona88153740402() 375    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->start_time_stamp; }},
__anona88153740502() 376   {"end_time_stamp", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->end_time_stamp; }},
__anona88153740602() 377   {"device_addr", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->device_addr; }},
__anona88153740702() 378   {"stream_id", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->stream_id; }},
__anona88153740802() 379   {"pool_type", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->pool_name; }},
__anona88153740902() 380   {"size", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->size; }},
381   {"actual_peak_memory",
__anona88153740a02() 382    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->actual_peak_memory; }},
383   {"file_name",
__anona88153740b02() 384    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
385      auto mem_info = mem_block->mem_info.lock();
386      if (mem_info) {
387        oss << mem_info->file_name;
388      }
389    }},
390   {"line_num",
__anona88153740c02() 391    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
392      auto mem_info = mem_block->mem_info.lock();
393      if (mem_info) {
394        oss << mem_info->line_num;
395      }
396    }},
397   {"type",
__anona88153740d02() 398    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
399      auto mem_info = mem_block->mem_info.lock();
400      if (mem_info) {
401        oss << MemTypeToStr.at(mem_info->type);
402      }
403    }},
404   {"producer_task",
__anona88153740e02() 405    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
406      auto mem_info = mem_block->mem_info.lock();
407      if (mem_info) {
408        MS_EXCEPTION_IF_NULL(mem_info->producer_task);
409        oss << mem_info->producer_task->time_stamp;
410      }
411    }},
412   {"task_name",
__anona88153740f02() 413    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
414      auto mem_info = mem_block->mem_info.lock();
415      if (mem_info) {
416        MS_EXCEPTION_IF_NULL(mem_info->producer_task);
417        oss << mem_info->producer_task->task_name;
418      }
419    }},
420   {"node_name",
__anona88153741002() 421    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
422      auto mem_info = mem_block->mem_info.lock();
423      if (mem_info) {
424        MS_EXCEPTION_IF_NULL(mem_info->producer_task);
425        oss << mem_info->producer_task->node_name;
426      }
427    }},
428   {"graph_name",
__anona88153741102() 429    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
430      auto mem_info = mem_block->mem_info.lock();
431      if (mem_info) {
432        MS_EXCEPTION_IF_NULL(mem_info->producer_task);
433        oss << mem_info->producer_task->graph_name;
434      }
435    }},
436   {"user_tasks",
__anona88153741202() 437    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
438      auto mem_info = mem_block->mem_info.lock();
439      if (mem_info) {
440        oss << task_list_to_str(mem_info->user_tasks);
441      }
442    }},
443   {"python_stack",
__anona88153741302() 444    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
445      auto mem_info = mem_block->mem_info.lock();
446      if (mem_info) {
447        MS_EXCEPTION_IF_NULL(mem_info->producer_task);
448        oss << mem_info->producer_task->python_stack;
449      }
450    }},
451 };
452 
453 const std::vector<std::pair<std::string, std::function<void(const TaskInfoPtr &, std::ofstream &)>>> task_csv = {
__anona88153741402() 454   {"time_stamp", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->time_stamp; }},
__anona88153741502() 455   {"task_name", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->task_name; }},
__anona88153741602() 456   {"node_name", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->node_name; }},
__anona88153741702() 457   {"graph_name", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->graph_name; }},
__anona88153741802() 458   {"file_name", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->file_name; }},
__anona88153741902() 459   {"line_num", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->line_num; }},
460 };
461 
462 const std::vector<std::pair<std::string, std::function<void(const MemBlockInfoPtr &, std::ofstream &)>>> prof_csv = {
463   {"Name",
__anona88153741a02() 464    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
465      auto mem_info = mem_block->mem_info.lock();
466      if (mem_info) {
467        MS_EXCEPTION_IF_NULL(mem_info->producer_task);
468        oss << mem_info->producer_task->node_name;
469      }
470    }},
471   {"Size(KB)", [](const MemBlockInfoPtr &mem_block,
__anona88153741b02() 472                   std::ofstream &oss) { oss << (static_cast<float>(mem_block->size) / kKBToByte); }},
473   {"Allocation Time(us)",
__anona88153741c02() 474    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->real_start_time; }},
475   {"Duration(us)",
__anona88153741d02() 476    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
477      if (mem_block->real_end_time > 0) {
478        oss << (mem_block->real_end_time - mem_block->real_start_time);
479      }
480    }},
481   {"Allocation Total Allocated(MB)",
__anona88153741e02() 482    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
483      oss << (static_cast<float>(mem_block->alloc_in_used_size) / kKBToByte / kMBToKB);
484    }},
485   {"Allocation Total Reserved(MB)",
__anona88153741f02() 486    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
487      oss << (static_cast<float>(mem_block->alloc_total_size) / kKBToByte / kMBToKB);
488    }},
489   {"Release Total Allocated(MB)",
__anona88153742002() 490    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
491      oss << (static_cast<float>(mem_block->release_in_used_size) / kKBToByte / kMBToKB);
492    }},
493   {"Release Total Reserved(MB)",
__anona88153742102() 494    [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
495      oss << (static_cast<float>(mem_block->release_total_size) / kKBToByte / kMBToKB);
496    }},
__anona88153742202() 497   {"Device", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->pool_name; }},
498 };
499 }  // namespace
500 
Dump()501 void MemoryTrackerEnabled::Dump() {
502   std::lock_guard<std::mutex> lock(mutex_);
503   if (has_dump) {
504     return;
505   }
506   has_dump = true;
507 
508   auto [block_csv_path, task_csv_path] = GetPath();
509   auto block_csv_path_opt = Common::CreatePrefixPath(block_csv_path);
510   auto task_csv_path_opt = Common::CreatePrefixPath(task_csv_path);
511   if (!block_csv_path_opt.has_value() || !task_csv_path_opt.has_value()) {
512     MS_LOG(ERROR) << "Get realpath failed, block_csv_path:" << block_csv_path << ", task_csv_path:" << task_csv_path;
513     return;
514   }
515 
516   MS_LOG(INFO) << "MemoryTracker Dump start";
517   ChangeFileMode(block_csv_path_opt.value(), S_IWUSR | S_IRUSR);
518   std::ofstream block_file(block_csv_path_opt.value());
519   if (!block_file) {
520     MS_LOG(EXCEPTION) << "Open file " << block_csv_path_opt.value() << " failed.";
521   }
522   size_t not_bind_size = 0;
523   for (const auto &csv : block_csv) {
524     block_file << csv.first << ",";
525   }
526   block_file << "\n";
527   for (auto &mem_block : mem_block_list_) {
528     if (mem_block->pool_name == "CPU") {
529       continue;
530     }
531     for (const auto &csv : block_csv) {
532       csv.second(mem_block, block_file);
533       block_file << ",";
534     }
535     if (!mem_block->is_bind) {
536       not_bind_size += mem_block->size;
537     }
538     block_file << "\n";
539   }
540 
541   ChangeFileMode(task_csv_path_opt.value(), S_IWUSR | S_IRUSR);
542   std::ofstream task_file(task_csv_path_opt.value());
543   if (!task_file) {
544     MS_LOG(EXCEPTION) << "Open file " << task_csv_path_opt.value() << " failed.";
545   }
546   for (const auto &csv : task_csv) {
547     task_file << csv.first << ",";
548   }
549   task_file << "\n";
550   for (auto &task : task_list_) {
551     for (const auto &csv : task_csv) {
552       csv.second(task, task_file);
553       task_file << ",";
554     }
555     task_file << "\n";
556   }
557 
558   block_file.close();
559   task_file.close();
560   ChangeFileMode(block_csv_path_opt.value(), S_IWUSR | S_IRUSR);
561   ChangeFileMode(task_csv_path_opt.value(), S_IWUSR | S_IRUSR);
562   MS_LOG(INFO) << "Not bind size, " << not_bind_size;
563   MS_LOG(INFO) << "MemoryTracker Dump end";
564 }
565 
UpdateProfilingPos()566 void MemoryTrackerEnabled::UpdateProfilingPos() {
567   std::lock_guard<std::mutex> lock(mutex_);
568   last_profiling_pos_ = mem_info_list_.size();
569 }
570 
DumpProfilingMemInfo(const std::string & path,const std::string & file_name)571 void MemoryTrackerEnabled::DumpProfilingMemInfo(const std::string &path, const std::string &file_name) {
572   std::lock_guard<std::mutex> lock(mutex_);
573 
574   auto csv_path = path + "/" + file_name + "_" + GetRankID() + ".csv";
575   auto csv_path_opt = Common::CreatePrefixPath(csv_path);
576   if (!csv_path_opt.has_value()) {
577     MS_LOG(ERROR) << "Get realpath failed, csv_path:" << csv_path;
578     return;
579   }
580 
581   MS_LOG(INFO) << "MemoryTracker DumpProfilingMemInfo start, last_profiling_pos:" << last_profiling_pos_;
582   ChangeFileMode(csv_path_opt.value(), S_IWUSR | S_IRUSR);
583   std::ofstream block_file(csv_path_opt.value());
584   auto old_file_flags = block_file.flags();
585   auto old_precision = block_file.precision();
586   block_file.unsetf(std::ios_base::floatfield);
587   block_file.precision(kPrecisionDigits);
588   for (const auto &csv : prof_csv) {
589     block_file << csv.first << ",";
590   }
591   block_file << "\n";
592 
593   for (size_t i = 0; i < mem_block_list_.size(); i++) {
594     const auto &mem_block = mem_block_list_[i];
595     if (i < last_profiling_pos_) {
596       continue;
597     }
598 
599     if (mem_block->pool_name == "CPU") {
600       continue;
601     }
602 
603     if (mem_block->start_time_stamp == kIllegalStartTimeStamp) {
604       MS_LOG(DEBUG) << "Mem block start time stamp is " << kIllegalStartTimeStamp << ".";
605       continue;
606     }
607 
608     for (const auto &csv : prof_csv) {
609       csv.second(mem_block, block_file);
610       block_file << ",";
611     }
612     block_file << "\n";
613   }
614 
615   // Restore file flags and precision
616   block_file.flags(old_file_flags);
617   block_file.precision(old_precision);
618   block_file.close();
619   ChangeFileMode(csv_path_opt.value(), S_IWUSR | S_IRUSR);
620 
621   // record the last time stamp
622   last_profiling_pos_ = mem_block_list_.size();
623   MS_LOG(INFO) << "MemoryTracker DumpProfilingMemInfo end, last_profiling_pos:" << last_profiling_pos_;
624 }
625 
626 }  // namespace tracker
627 }  // namespace device
628 }  // namespace mindspore
629