• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "plugin/device/ascend/hal/profiler/memory_profiling.h"
18 #include <fstream>
19 #include <memory>
20 #include "utils/log_adapter.h"
21 #include "utils/ms_context.h"
22 #include "utils/ms_utils.h"
23 #include "nlohmann/json.hpp"
24 #include "plugin/device/ascend/hal/profiler/options.h"
25 
namespace mindspore {
namespace profiler {
namespace ascend {
// Default output sub-directory name.
// NOTE(review): kOutputPath is not referenced in this file — the save path comes
// from GetOutputPath() (options.h); confirm other TUs use it before removing.
constexpr char kOutputPath[] = "output";
30 
SetMemoryProfilingInitialize(const std::string & profiling_options)31 void MemoryProfiling::SetMemoryProfilingInitialize(const std::string &profiling_options) {
32   nlohmann::json options;
33   try {
34     options = nlohmann::json::parse(profiling_options);
35   } catch (nlohmann::json::exception &e) {
36     MS_LOG(EXCEPTION) << "Failed to parse profiling options because of format error.";
37   }
38 
39   if (options["profile_memory"] == "on") {
40     is_initialized_ = true;
41   }
42 }
43 
StartMemoryProfiling()44 void MemoryProfiling::StartMemoryProfiling() {
45   is_enabled_ = true;
46   if (NeedSaveMemoryProfiling()) {
47     SaveMemoryProfiling();
48     has_save_memory_data_ = true;
49   }
50 }
51 
StopMemoryProfiling()52 void MemoryProfiling::StopMemoryProfiling() { is_enabled_ = false; }
53 
AddGraphMemoryNode(uint32_t graph_id)54 std::shared_ptr<GraphMemory> MemoryProfiling::AddGraphMemoryNode(uint32_t graph_id) {
55   std::shared_ptr<GraphMemory> node = std::make_shared<GraphMemory>(graph_id);
56   MS_EXCEPTION_IF_NULL(node);
57   graph_memory_[graph_id] = node;
58   return node;
59 }
60 
GetGraphMemoryNode(uint32_t graph_id) const61 std::shared_ptr<GraphMemory> MemoryProfiling::GetGraphMemoryNode(uint32_t graph_id) const {
62   auto node = graph_memory_.find(graph_id);
63   if (node != graph_memory_.end()) {
64     return node->second;
65   }
66 
67   return nullptr;
68 }
69 
MemoryToPB()70 bool MemoryProfiling::MemoryToPB() {
71   memory_proto_.set_total_mem(device_mem_size_);
72   if (graph_memory_.size() == 0) {
73     MS_LOG(INFO) << "No memory profiling data need to be reported.";
74     return false;
75   }
76 
77   for (const auto &graph : graph_memory_) {
78     GraphMemProto *graph_proto = memory_proto_.add_graph_mem();
79     if (graph_proto == nullptr) {
80       MS_LOG(ERROR) << "Add graph memory proto failed.";
81       return false;
82     }
83     graph_proto->set_graph_id(graph.second->GetGraphId());
84     graph_proto->set_static_mem(graph.second->GetStaticMemSize());
85     // node memory to PB
86     for (const auto &node : graph.second->GetNodeMemory()) {
87       NodeMemProto *node_mem = graph_proto->add_node_mems();
88       if (node_mem == nullptr) {
89         MS_LOG(ERROR) << "Add node memory proto failed.";
90         return false;
91       }
92       node_mem->set_node_name(node.GetNodeName());
93       node_mem->set_node_id(node.GetNodeId());
94       for (const auto &id : node.GetInputTensorId()) {
95         node_mem->add_input_tensor_id(id);
96       }
97       for (const auto &id : node.GetOutputTensorId()) {
98         node_mem->add_output_tensor_id(id);
99       }
100       for (const auto &id : node.GetOutputTensorId()) {
101         node_mem->add_workspace_tensor_id(id);
102       }
103     }
104     // tensor memory to PB
105     for (const auto &node : graph.second->GetTensorMemory()) {
106       TensorMemProto *tensor_mem = graph_proto->add_tensor_mems();
107       if (tensor_mem == nullptr) {
108         MS_LOG(ERROR) << "Add node memory proto failed.";
109         return false;
110       }
111       tensor_mem->set_tensor_id(node.GetTensorId());
112       tensor_mem->set_size(node.GetAlignedSize());
113       std::string type = node.GetType();
114       tensor_mem->set_type(type);
115       tensor_mem->set_life_start(node.GetLifeStart());
116       tensor_mem->set_life_end(node.GetLifeEnd());
117       std::string life_long = node.GetLifeLong();
118       tensor_mem->set_life_long(life_long);
119     }
120   }
121   MS_LOG(INFO) << "Memory profiling data to PB end.";
122   return true;
123 }
124 
SaveMemoryProfiling()125 void MemoryProfiling::SaveMemoryProfiling() {
126   auto context = MsContext::GetInstance();
127   MS_EXCEPTION_IF_NULL(context);
128   std::string dir_path = GetOutputPath();
129   auto device_id = common::GetEnv("RANK_ID");
130   // If RANK_ID is not set, default value is 0
131   if (device_id.empty()) {
132     device_id = "0";
133   }
134 
135   if (!MemoryToPB()) {
136     return;
137   }
138 
139   std::string file = dir_path + std::string("/memory_usage_") + std::string(device_id) + std::string(".pb");
140   std::fstream handle(file, std::ios::out | std::ios::trunc | std::ios::binary);
141   if (!memory_proto_.SerializeToOstream(&handle)) {
142     MS_LOG(ERROR) << "Save memory profiling data to file failed";
143   }
144   handle.close();
145 
146   MS_LOG(INFO) << "Start save memory profiling data to " << file << " end";
147   return;
148 }
149 }  // namespace ascend
150 }  // namespace profiler
151 }  // namespace mindspore
152