1 /**
2 * Copyright 2024 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "include/backend/mem_reuse/mem_tracker.h"
18 #include <fstream>
19 #include "frontend/parallel/group_manager.h"
20 #include "utils/ms_context.h"
21 #include "include/common/debug/common.h"
22 #include "include/common/utils/comm_manager.h"
23 #include "include/backend/device_type.h"
24 #include "include/backend/mem_reuse/mem_dynamic_allocator.h"
25 #include "include/common/utils/utils.h"
26 #include "include/backend/distributed/collective/collective_manager.h"
27 #include "utils/file_utils.h"
28
29 namespace mindspore {
30 namespace device {
31 namespace tracker {
// Sentinel start_time_stamp value: DumpProfilingMemInfo skips every memory block whose start time stamp equals it.
constexpr int64_t kIllegalStartTimeStamp = -1;
33 namespace {
GetRankID()34 std::string GetRankID() {
35 uint32_t rank_id = 0;
36 #if !defined(BUILD_LITE)
37 if (distributed::collective::CollectiveManager::instance()->initialized()) {
38 rank_id = CommManager::GetInstance().GetRank();
39 }
40 #endif
41 return std::to_string(rank_id);
42 }
43
GetAllocatorType(MemType mem_type)44 AllocatorType GetAllocatorType(MemType mem_type) {
45 static std::map<MemType, device::AllocatorType> mem_allocator_type_map = {
46 {MemType::kWeight, AllocatorType::kWeight},
47 {MemType::kConstantValue, AllocatorType::kConstantValue},
48 {MemType::kKernel, AllocatorType::kConstantValue},
49 {MemType::kGraphOutput, AllocatorType::kGraphOutput},
50 {MemType::kSomas, AllocatorType::kConstantValue},
51 {MemType::kInSideSomas, AllocatorType::kConstantValue},
52 {MemType::kSomasOutput, AllocatorType::kKernelOutput},
53 {MemType::kGeConst, AllocatorType::kConstantValue},
54 {MemType::kBatchMemory, AllocatorType::kConstantValue},
55 {MemType::kContinuousMemory, AllocatorType::kConstantValue},
56 {MemType::kPyNativeInput, AllocatorType::kConstantValue},
57 {MemType::kPyNativeOutput, AllocatorType::kKernelOutput},
58 {MemType::kGeFeatureMemory, AllocatorType::kConstantValue},
59 {MemType::kWorkSpace, AllocatorType::kWorkspace},
60 {MemType::kOther, AllocatorType::kOther}};
61
62 auto iter = mem_allocator_type_map.find(mem_type);
63 if (iter == mem_allocator_type_map.end()) {
64 MS_LOG(WARNING) << "Not found mem_type:" << mem_type << " in mem_allocator_type_map.";
65 return AllocatorType::kOther;
66 }
67 return iter->second;
68 }
69 } // namespace
70
GetPath()71 std::pair<std::string, std::string> MemoryTrackerEnabled::GetPath() {
72 std::string block_csv_path;
73 std::string task_csv_path;
74
75 auto ms_context = MsContext::GetInstance();
76 auto trace_path = ms_context->get_param<std::string>(MS_CTX_PROF_MEM_OUTPUT_PATH);
77 if (trace_path.empty()) {
78 trace_path = "./";
79 }
80
81 if (enable_hccl_) {
82 block_csv_path = trace_path + "/rank_" + GetRankID() + "/memory_block.csv";
83 task_csv_path = trace_path + "/rank_" + GetRankID() + "/task.csv";
84 } else {
85 block_csv_path = trace_path + "/memory_block.csv";
86 task_csv_path = trace_path + "/task.csv";
87 }
88 return std::make_pair(block_csv_path, task_csv_path);
89 }
90
AddTask(const std::string & task_name,const std::string & node_name,const std::string & graph_name,const std::string & file_name,size_t line_num)91 void MemoryTrackerEnabled::AddTask(const std::string &task_name, const std::string &node_name,
92 const std::string &graph_name, const std::string &file_name, size_t line_num) {
93 std::string python_stack;
94 if (WithPythonStack()) {
95 python_stack = GetPythonStackStr();
96 }
97
98 std::lock_guard lock(mutex_);
99 if (!is_init_enable_hccl_) {
100 // MS_CTX_ENABLE_HCCL will be reset when the process is destroyed.
101 // Therefore, record the enable_hccl when AddTask for the first time.
102 auto ms_context = MsContext::GetInstance();
103 enable_hccl_ = ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL);
104 is_init_enable_hccl_ = true;
105 }
106
107 time_stamp_++;
108 auto task_info = std::make_shared<TaskInfo>();
109 MS_EXCEPTION_IF_NULL(task_info);
110 task_info->task_name = task_name;
111 task_info->node_name = node_name;
112 task_info->graph_name = graph_name;
113 task_info->file_name = file_name;
114 task_info->line_num = line_num;
115 task_info->time_stamp = time_stamp_;
116 task_info->python_stack = python_stack;
117 task_map_[task_name] = task_info;
118 task_list_.push_back(task_info);
119 }
120
NewMemInfo(const std::string & task_name,MemType type,size_t size,KernelTensorPtr kernel_tensor,const std::string & file_name,size_t line_num)121 MemInfoPtr MemoryTrackerEnabled::NewMemInfo(const std::string &task_name, MemType type, size_t size,
122 KernelTensorPtr kernel_tensor, const std::string &file_name,
123 size_t line_num) {
124 auto mem_info = std::make_shared<MemInfo>();
125 MS_EXCEPTION_IF_NULL(mem_info);
126 mem_info->type = type;
127 mem_info->size = size;
128 mem_info->kernel_tensor = kernel_tensor;
129 mem_info->file_name = file_name;
130 mem_info->line_num = line_num;
131 auto iter = task_map_.find(task_name);
132 if (iter == task_map_.end()) {
133 MS_LOG(ERROR) << "MemoryTracker AddMemInfo failed, task_name:" << task_name << " not found, " << file_name << ":"
134 << line_num;
135 return nullptr;
136 }
137
138 const auto &node_name = iter->second->node_name;
139 DynamicMemAllocatorDebugInfo::SetDebugInfo(node_name, GetAllocatorType(type));
140
141 mem_info->producer_task = iter->second;
142 mem_info_list_.push_back(mem_info);
143 return mem_info;
144 }
145
AddMemInfoForKernelTensor(const std::string & task_name,MemType type,size_t size,KernelTensorPtr kernel_tensor,const std::string & file_name,size_t line_num)146 void MemoryTrackerEnabled::AddMemInfoForKernelTensor(const std::string &task_name, MemType type, size_t size,
147 KernelTensorPtr kernel_tensor, const std::string &file_name,
148 size_t line_num) {
149 auto mem_info = NewMemInfo(task_name, type, size, kernel_tensor, file_name, line_num);
150 if (mem_info != nullptr) {
151 kernel_tensor_mem_map[kernel_tensor] = mem_info;
152 }
153 }
154
AddMemInfo(const std::string & task_name,MemType type,size_t size,DeviceAddress * device_address,const std::string & file_name,size_t line_num)155 void MemoryTrackerEnabled::AddMemInfo(const std::string &task_name, MemType type, size_t size,
156 DeviceAddress *device_address, const std::string &file_name, size_t line_num) {
157 MS_EXCEPTION_IF_NULL(device_address);
158 if (device_address->GetDeviceType() == DeviceType::kCPU) {
159 return;
160 }
161 std::lock_guard<std::mutex> lock(mutex_);
162
163 if (device_address->kernel_tensor() == nullptr) {
164 auto mem_info = NewMemInfo(task_name, type, size, nullptr, file_name, line_num);
165 device_address_mem_map[device_address] = mem_info;
166 } else {
167 AddMemInfoForKernelTensor(task_name, type, size, device_address->kernel_tensor().get(), file_name, line_num);
168 }
169 }
170
UpdateMemInfo(const DeviceAddress * device_address,MemType mem_type,const std::string & file_name,size_t line_num)171 void MemoryTrackerEnabled::UpdateMemInfo(const DeviceAddress *device_address, MemType mem_type,
172 const std::string &file_name, size_t line_num) {
173 std::lock_guard lock(mutex_);
174 if (device_address->GetDeviceType() == DeviceType::kCPU) {
175 return;
176 }
177 auto kernel_tensor = device_address->kernel_tensor().get();
178 auto iter = kernel_tensor_mem_map.find(kernel_tensor);
179 if (iter == kernel_tensor_mem_map.end()) {
180 MS_LOG(ERROR) << "MemoryTracker UpdateMemInfoMemType failed, kernel_tensor:" << kernel_tensor << " not found";
181 return;
182 }
183 iter->second->type = mem_type;
184 iter->second->file_name = file_name;
185 iter->second->line_num = line_num;
186 }
187
AddCompileTimeMemInfo(const std::string & task_name,size_t size,DeviceMemPtr device_ptr,MemType mem_type,const std::string & file_name,size_t line_num)188 void MemoryTrackerEnabled::AddCompileTimeMemInfo(const std::string &task_name, size_t size, DeviceMemPtr device_ptr,
189 MemType mem_type, const std::string &file_name, size_t line_num) {
190 std::lock_guard lock(mutex_);
191 auto mem_info = std::make_shared<MemInfo>();
192 MS_EXCEPTION_IF_NULL(mem_info);
193 mem_info->type = mem_type;
194 mem_info->size = size;
195 mem_info->file_name = file_name;
196 mem_info->line_num = line_num;
197 auto iter = task_map_.find(task_name);
198 if (iter == task_map_.end()) {
199 MS_LOG(ERROR) << "MemoryTracker AddCompileTimeMemInfo failed, task_name:" << task_name << " not found, "
200 << file_name << ":" << line_num;
201 return;
202 }
203 mem_info->producer_task = iter->second;
204 auto mem_block_iter = device_mem_block_map.find(device_ptr);
205 if (mem_block_iter == device_mem_block_map.end()) {
206 MS_LOG(ERROR) << "MemoryTracker AddCompileTimeMemInfo failed, device_ptr:" << device_ptr << " not found, "
207 << file_name << ":" << line_num;
208 return;
209 }
210 mem_info->mem_block = mem_block_iter->second;
211 mem_info->mem_block->is_bind = true;
212 mem_info->mem_block->mem_info = mem_info;
213 mem_info_list_.push_back(mem_info);
214 }
215
BindDevicePtr(DeviceAddress * device_address,DeviceMemPtr device_ptr,const std::string & file_name,size_t line_num)216 void MemoryTrackerEnabled::BindDevicePtr(DeviceAddress *device_address, DeviceMemPtr device_ptr,
217 const std::string &file_name, size_t line_num) {
218 if (device_address == nullptr) {
219 return;
220 }
221 std::lock_guard<std::mutex> lock(mutex_);
222 if (device_address->GetDeviceType() == DeviceType::kCPU) {
223 return;
224 }
225 MemInfoPtr mem_info{nullptr};
226 if (device_address->kernel_tensor() == nullptr) {
227 auto iter = device_address_mem_map.find(device_address);
228 if (iter == device_address_mem_map.end()) {
229 MS_LOG(ERROR) << "MemoryTracker BindDevicePtr failed, device_address:" << device_address << " not found, "
230 << file_name << ":" << line_num;
231 return;
232 }
233 mem_info = iter->second;
234 } else {
235 auto iter = kernel_tensor_mem_map.find(device_address->kernel_tensor().get());
236 if (iter == kernel_tensor_mem_map.end()) {
237 MS_LOG(ERROR) << "MemoryTracker BindDevicePtr failed, kernel_tensor:" << device_address->kernel_tensor().get()
238 << " not found, " << file_name << ":" << line_num;
239 return;
240 }
241 mem_info = iter->second;
242 }
243
244 if (mem_info->type == MemType::kInSideSomas) {
245 auto mem_block_info = std::make_shared<MemBlockInfo>();
246 MS_EXCEPTION_IF_NULL(mem_block_info);
247 mem_block_info->device_addr = device_ptr;
248 mem_block_info->size = mem_info->size;
249 mem_block_info->start_time_stamp = -1;
250 mem_block_info->end_time_stamp = -1;
251 mem_block_info->is_bind = true;
252 mem_block_info->mem_info = mem_info;
253 mem_info->mem_block = mem_block_info;
254 device_mem_block_map[device_ptr] = mem_block_info;
255 mem_block_list_.push_back(mem_block_info);
256 // mem_block need to dump again, after mem_block_list_ changed
257 has_dump = false;
258 return;
259 }
260 auto mem_block_iter = device_mem_block_map.find(device_ptr);
261 if (mem_block_iter == device_mem_block_map.end()) {
262 MS_LOG(ERROR) << "MemoryTracker BindDevicePtr failed, device_ptr:" << device_ptr << " not found, " << file_name
263 << ":" << line_num;
264 return;
265 }
266 mem_info->mem_block = mem_block_iter->second;
267 mem_info->mem_block->is_bind = true;
268 mem_info->mem_block->mem_info = mem_info;
269 }
270
UpdateDevicePtrInfo(DeviceMemPtr device_ptr,MemType mem_type,const std::string & task_name,const std::string & file_name,size_t line_num)271 void MemoryTrackerEnabled::UpdateDevicePtrInfo(DeviceMemPtr device_ptr, MemType mem_type, const std::string &task_name,
272 const std::string &file_name, size_t line_num) {
273 std::lock_guard lock(mutex_);
274 auto mem_block_iter = device_mem_block_map.find(device_ptr);
275 if (mem_block_iter == device_mem_block_map.end()) {
276 MS_LOG(ERROR) << "MemoryTracker AddCompileTimeMemInfo failed, device_ptr:" << device_ptr << " not found, "
277 << file_name << ":" << line_num;
278 return;
279 }
280 auto mem_info = std::make_shared<MemInfo>();
281 MS_EXCEPTION_IF_NULL(mem_info);
282 auto task_info = std::make_shared<TaskInfo>();
283 MS_EXCEPTION_IF_NULL(task_info);
284 task_info->task_name = task_name;
285 mem_info->producer_task = task_info;
286 mem_info->file_name = file_name;
287 mem_info->line_num = line_num;
288 mem_info->type = mem_type;
289 mem_info->mem_block = mem_block_iter->second;
290 mem_info->mem_block->is_bind = true;
291 mem_info->mem_block->mem_info = mem_info;
292 mem_info_list_.push_back(mem_info);
293 }
294
AllocMemBlock(DeviceMemPtr device_addr,size_t size,const std::string & pool_name,size_t actual_peak_memory,size_t in_used_size,size_t total_size,uint32_t stream_id)295 void MemoryTrackerEnabled::AllocMemBlock(DeviceMemPtr device_addr, size_t size, const std::string &pool_name,
296 size_t actual_peak_memory, size_t in_used_size, size_t total_size,
297 uint32_t stream_id) {
298 std::lock_guard lock(mutex_);
299 time_stamp_++;
300 auto mem_block = std::make_shared<MemBlockInfo>();
301 MS_EXCEPTION_IF_NULL(mem_block);
302 mem_block->device_addr = device_addr;
303 mem_block->start_time_stamp = time_stamp_;
304 mem_block->actual_peak_memory = actual_peak_memory;
305 mem_block->size = size;
306 mem_block->pool_name = pool_name;
307 mem_block->stream_id = stream_id;
308 mem_block->real_start_time = GetCurrentUSec();
309 mem_block->alloc_in_used_size = in_used_size;
310 mem_block->alloc_total_size = total_size;
311 device_mem_block_map[device_addr] = mem_block;
312 real_device_mem_block_map[device_addr] = mem_block;
313 mem_block_list_.emplace_back(mem_block);
314 // mem_block need to dump again, after mem_block_list_ changed
315 has_dump = false;
316 }
317
FreeMemBlock(DeviceMemPtr device_addr,size_t in_used_size,size_t total_size)318 void MemoryTrackerEnabled::FreeMemBlock(DeviceMemPtr device_addr, size_t in_used_size, size_t total_size) {
319 std::lock_guard lock(mutex_);
320 time_stamp_++;
321 auto iter = real_device_mem_block_map.find(device_addr);
322 if (iter == real_device_mem_block_map.end()) {
323 MS_LOG(ERROR) << "MemoryTracker FreeMemBlock failed, device_addr:" << device_addr << " not found";
324 return;
325 }
326 iter->second->end_time_stamp = time_stamp_;
327 iter->second->real_end_time = GetCurrentUSec();
328 iter->second->release_in_used_size = in_used_size;
329 iter->second->release_total_size = total_size;
330 }
331
UseMemBlock(const std::string & task_name,DeviceMemPtr device_addr,const std::string & file_name,size_t line_num)332 void MemoryTrackerEnabled::UseMemBlock(const std::string &task_name, DeviceMemPtr device_addr,
333 const std::string &file_name, size_t line_num) {
334 std::lock_guard lock(mutex_);
335 auto iter = device_mem_block_map.find(device_addr);
336 if (iter == device_mem_block_map.end()) {
337 MS_LOG(ERROR) << "MemoryTracker UseMemBlock failed, device_addr:" << device_addr << " not found, " << file_name
338 << ":" << line_num;
339 return;
340 }
341 if (iter->second->pool_name == "CPU") {
342 return;
343 }
344 auto task_iter = task_map_.find(task_name);
345 if (task_iter == task_map_.end()) {
346 MS_LOG(ERROR) << "MemoryTracker UseMemBlock failed, task_name:" << task_name << " not found, " << file_name << ":"
347 << line_num;
348 return;
349 }
350 auto mem_info = iter->second->mem_info.lock();
351 if (mem_info == nullptr) {
352 MS_LOG(ERROR) << "MemoryTracker UseMemBlock failed, mem_info is null, " << file_name << ":" << line_num;
353 return;
354 }
355 mem_info->user_tasks.push_back(task_iter->second);
356 }
357
358 namespace {
// Unit conversion divisors used by the profiling CSV columns.
constexpr size_t kKBToByte = 1024;
constexpr size_t kMBToKB = 1024;
// Stream precision applied while the profiling CSV is written.
constexpr int kPrecisionDigits = 20;
362
__anona88153740302(const std::vector<TaskInfoPtr> &task_list) 363 auto task_list_to_str = [](const std::vector<TaskInfoPtr> &task_list) -> std::string {
364 std::stringstream ss;
365 ss << "{";
366 for (auto &task : task_list) {
367 ss << task->time_stamp << "-";
368 }
369 ss << "}";
370 return ss.str();
371 };
372
373 const std::vector<std::pair<std::string, std::function<void(const MemBlockInfoPtr &, std::ofstream &)>>> block_csv = {
374 {"start_time_stamp",
__anona88153740402() 375 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->start_time_stamp; }},
__anona88153740502() 376 {"end_time_stamp", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->end_time_stamp; }},
__anona88153740602() 377 {"device_addr", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->device_addr; }},
__anona88153740702() 378 {"stream_id", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->stream_id; }},
__anona88153740802() 379 {"pool_type", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->pool_name; }},
__anona88153740902() 380 {"size", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->size; }},
381 {"actual_peak_memory",
__anona88153740a02() 382 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->actual_peak_memory; }},
383 {"file_name",
__anona88153740b02() 384 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
385 auto mem_info = mem_block->mem_info.lock();
386 if (mem_info) {
387 oss << mem_info->file_name;
388 }
389 }},
390 {"line_num",
__anona88153740c02() 391 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
392 auto mem_info = mem_block->mem_info.lock();
393 if (mem_info) {
394 oss << mem_info->line_num;
395 }
396 }},
397 {"type",
__anona88153740d02() 398 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
399 auto mem_info = mem_block->mem_info.lock();
400 if (mem_info) {
401 oss << MemTypeToStr.at(mem_info->type);
402 }
403 }},
404 {"producer_task",
__anona88153740e02() 405 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
406 auto mem_info = mem_block->mem_info.lock();
407 if (mem_info) {
408 MS_EXCEPTION_IF_NULL(mem_info->producer_task);
409 oss << mem_info->producer_task->time_stamp;
410 }
411 }},
412 {"task_name",
__anona88153740f02() 413 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
414 auto mem_info = mem_block->mem_info.lock();
415 if (mem_info) {
416 MS_EXCEPTION_IF_NULL(mem_info->producer_task);
417 oss << mem_info->producer_task->task_name;
418 }
419 }},
420 {"node_name",
__anona88153741002() 421 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
422 auto mem_info = mem_block->mem_info.lock();
423 if (mem_info) {
424 MS_EXCEPTION_IF_NULL(mem_info->producer_task);
425 oss << mem_info->producer_task->node_name;
426 }
427 }},
428 {"graph_name",
__anona88153741102() 429 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
430 auto mem_info = mem_block->mem_info.lock();
431 if (mem_info) {
432 MS_EXCEPTION_IF_NULL(mem_info->producer_task);
433 oss << mem_info->producer_task->graph_name;
434 }
435 }},
436 {"user_tasks",
__anona88153741202() 437 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
438 auto mem_info = mem_block->mem_info.lock();
439 if (mem_info) {
440 oss << task_list_to_str(mem_info->user_tasks);
441 }
442 }},
443 {"python_stack",
__anona88153741302() 444 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
445 auto mem_info = mem_block->mem_info.lock();
446 if (mem_info) {
447 MS_EXCEPTION_IF_NULL(mem_info->producer_task);
448 oss << mem_info->producer_task->python_stack;
449 }
450 }},
451 };
452
453 const std::vector<std::pair<std::string, std::function<void(const TaskInfoPtr &, std::ofstream &)>>> task_csv = {
__anona88153741402() 454 {"time_stamp", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->time_stamp; }},
__anona88153741502() 455 {"task_name", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->task_name; }},
__anona88153741602() 456 {"node_name", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->node_name; }},
__anona88153741702() 457 {"graph_name", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->graph_name; }},
__anona88153741802() 458 {"file_name", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->file_name; }},
__anona88153741902() 459 {"line_num", [](const TaskInfoPtr &task, std::ofstream &oss) { oss << task->line_num; }},
460 };
461
462 const std::vector<std::pair<std::string, std::function<void(const MemBlockInfoPtr &, std::ofstream &)>>> prof_csv = {
463 {"Name",
__anona88153741a02() 464 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
465 auto mem_info = mem_block->mem_info.lock();
466 if (mem_info) {
467 MS_EXCEPTION_IF_NULL(mem_info->producer_task);
468 oss << mem_info->producer_task->node_name;
469 }
470 }},
471 {"Size(KB)", [](const MemBlockInfoPtr &mem_block,
__anona88153741b02() 472 std::ofstream &oss) { oss << (static_cast<float>(mem_block->size) / kKBToByte); }},
473 {"Allocation Time(us)",
__anona88153741c02() 474 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->real_start_time; }},
475 {"Duration(us)",
__anona88153741d02() 476 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
477 if (mem_block->real_end_time > 0) {
478 oss << (mem_block->real_end_time - mem_block->real_start_time);
479 }
480 }},
481 {"Allocation Total Allocated(MB)",
__anona88153741e02() 482 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
483 oss << (static_cast<float>(mem_block->alloc_in_used_size) / kKBToByte / kMBToKB);
484 }},
485 {"Allocation Total Reserved(MB)",
__anona88153741f02() 486 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
487 oss << (static_cast<float>(mem_block->alloc_total_size) / kKBToByte / kMBToKB);
488 }},
489 {"Release Total Allocated(MB)",
__anona88153742002() 490 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
491 oss << (static_cast<float>(mem_block->release_in_used_size) / kKBToByte / kMBToKB);
492 }},
493 {"Release Total Reserved(MB)",
__anona88153742102() 494 [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) {
495 oss << (static_cast<float>(mem_block->release_total_size) / kKBToByte / kMBToKB);
496 }},
__anona88153742202() 497 {"Device", [](const MemBlockInfoPtr &mem_block, std::ofstream &oss) { oss << mem_block->pool_name; }},
498 };
499 } // namespace
500
Dump()501 void MemoryTrackerEnabled::Dump() {
502 std::lock_guard<std::mutex> lock(mutex_);
503 if (has_dump) {
504 return;
505 }
506 has_dump = true;
507
508 auto [block_csv_path, task_csv_path] = GetPath();
509 auto block_csv_path_opt = Common::CreatePrefixPath(block_csv_path);
510 auto task_csv_path_opt = Common::CreatePrefixPath(task_csv_path);
511 if (!block_csv_path_opt.has_value() || !task_csv_path_opt.has_value()) {
512 MS_LOG(ERROR) << "Get realpath failed, block_csv_path:" << block_csv_path << ", task_csv_path:" << task_csv_path;
513 return;
514 }
515
516 MS_LOG(INFO) << "MemoryTracker Dump start";
517 ChangeFileMode(block_csv_path_opt.value(), S_IWUSR | S_IRUSR);
518 std::ofstream block_file(block_csv_path_opt.value());
519 if (!block_file) {
520 MS_LOG(EXCEPTION) << "Open file " << block_csv_path_opt.value() << " failed.";
521 }
522 size_t not_bind_size = 0;
523 for (const auto &csv : block_csv) {
524 block_file << csv.first << ",";
525 }
526 block_file << "\n";
527 for (auto &mem_block : mem_block_list_) {
528 if (mem_block->pool_name == "CPU") {
529 continue;
530 }
531 for (const auto &csv : block_csv) {
532 csv.second(mem_block, block_file);
533 block_file << ",";
534 }
535 if (!mem_block->is_bind) {
536 not_bind_size += mem_block->size;
537 }
538 block_file << "\n";
539 }
540
541 ChangeFileMode(task_csv_path_opt.value(), S_IWUSR | S_IRUSR);
542 std::ofstream task_file(task_csv_path_opt.value());
543 if (!task_file) {
544 MS_LOG(EXCEPTION) << "Open file " << task_csv_path_opt.value() << " failed.";
545 }
546 for (const auto &csv : task_csv) {
547 task_file << csv.first << ",";
548 }
549 task_file << "\n";
550 for (auto &task : task_list_) {
551 for (const auto &csv : task_csv) {
552 csv.second(task, task_file);
553 task_file << ",";
554 }
555 task_file << "\n";
556 }
557
558 block_file.close();
559 task_file.close();
560 ChangeFileMode(block_csv_path_opt.value(), S_IWUSR | S_IRUSR);
561 ChangeFileMode(task_csv_path_opt.value(), S_IWUSR | S_IRUSR);
562 MS_LOG(INFO) << "Not bind size, " << not_bind_size;
563 MS_LOG(INFO) << "MemoryTracker Dump end";
564 }
565
UpdateProfilingPos()566 void MemoryTrackerEnabled::UpdateProfilingPos() {
567 std::lock_guard<std::mutex> lock(mutex_);
568 last_profiling_pos_ = mem_info_list_.size();
569 }
570
DumpProfilingMemInfo(const std::string & path,const std::string & file_name)571 void MemoryTrackerEnabled::DumpProfilingMemInfo(const std::string &path, const std::string &file_name) {
572 std::lock_guard<std::mutex> lock(mutex_);
573
574 auto csv_path = path + "/" + file_name + "_" + GetRankID() + ".csv";
575 auto csv_path_opt = Common::CreatePrefixPath(csv_path);
576 if (!csv_path_opt.has_value()) {
577 MS_LOG(ERROR) << "Get realpath failed, csv_path:" << csv_path;
578 return;
579 }
580
581 MS_LOG(INFO) << "MemoryTracker DumpProfilingMemInfo start, last_profiling_pos:" << last_profiling_pos_;
582 ChangeFileMode(csv_path_opt.value(), S_IWUSR | S_IRUSR);
583 std::ofstream block_file(csv_path_opt.value());
584 auto old_file_flags = block_file.flags();
585 auto old_precision = block_file.precision();
586 block_file.unsetf(std::ios_base::floatfield);
587 block_file.precision(kPrecisionDigits);
588 for (const auto &csv : prof_csv) {
589 block_file << csv.first << ",";
590 }
591 block_file << "\n";
592
593 for (size_t i = 0; i < mem_block_list_.size(); i++) {
594 const auto &mem_block = mem_block_list_[i];
595 if (i < last_profiling_pos_) {
596 continue;
597 }
598
599 if (mem_block->pool_name == "CPU") {
600 continue;
601 }
602
603 if (mem_block->start_time_stamp == kIllegalStartTimeStamp) {
604 MS_LOG(DEBUG) << "Mem block start time stamp is " << kIllegalStartTimeStamp << ".";
605 continue;
606 }
607
608 for (const auto &csv : prof_csv) {
609 csv.second(mem_block, block_file);
610 block_file << ",";
611 }
612 block_file << "\n";
613 }
614
615 // Restore file flags and precision
616 block_file.flags(old_file_flags);
617 block_file.precision(old_precision);
618 block_file.close();
619 ChangeFileMode(csv_path_opt.value(), S_IWUSR | S_IRUSR);
620
621 // record the last time stamp
622 last_profiling_pos_ = mem_block_list_.size();
623 MS_LOG(INFO) << "MemoryTracker DumpProfilingMemInfo end, last_profiling_pos:" << last_profiling_pos_;
624 }
625
626 } // namespace tracker
627 } // namespace device
628 } // namespace mindspore
629