1 /** 2 * Copyright 2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "plugin/device/ascend/hal/profiler/memory_profiling.h" 18 #include <fstream> 19 #include <memory> 20 #include "utils/log_adapter.h" 21 #include "utils/ms_context.h" 22 #include "utils/ms_utils.h" 23 #include "nlohmann/json.hpp" 24 #include "plugin/device/ascend/hal/profiler/options.h" 25 26 namespace mindspore { 27 namespace profiler { 28 namespace ascend { 29 constexpr char kOutputPath[] = "output"; 30 SetMemoryProfilingInitialize(const std::string & profiling_options)31void MemoryProfiling::SetMemoryProfilingInitialize(const std::string &profiling_options) { 32 nlohmann::json options; 33 try { 34 options = nlohmann::json::parse(profiling_options); 35 } catch (nlohmann::json::exception &e) { 36 MS_LOG(EXCEPTION) << "Failed to parse profiling options because of format error."; 37 } 38 39 if (options["profile_memory"] == "on") { 40 is_initialized_ = true; 41 } 42 } 43 StartMemoryProfiling()44void MemoryProfiling::StartMemoryProfiling() { 45 is_enabled_ = true; 46 if (NeedSaveMemoryProfiling()) { 47 SaveMemoryProfiling(); 48 has_save_memory_data_ = true; 49 } 50 } 51 StopMemoryProfiling()52void MemoryProfiling::StopMemoryProfiling() { is_enabled_ = false; } 53 AddGraphMemoryNode(uint32_t graph_id)54std::shared_ptr<GraphMemory> MemoryProfiling::AddGraphMemoryNode(uint32_t graph_id) { 55 std::shared_ptr<GraphMemory> node = std::make_shared<GraphMemory>(graph_id); 56 MS_EXCEPTION_IF_NULL(node); 57 graph_memory_[graph_id] = node; 58 return node; 59 } 60 GetGraphMemoryNode(uint32_t graph_id) const61std::shared_ptr<GraphMemory> MemoryProfiling::GetGraphMemoryNode(uint32_t graph_id) const { 62 auto node = graph_memory_.find(graph_id); 63 if (node != graph_memory_.end()) { 64 return node->second; 65 } 66 67 return nullptr; 68 } 69 MemoryToPB()70bool MemoryProfiling::MemoryToPB() { 71 memory_proto_.set_total_mem(device_mem_size_); 72 if (graph_memory_.size() == 0) { 73 MS_LOG(INFO) << "No memory profiling data need to be reported."; 74 return false; 75 } 76 77 for (const auto &graph : graph_memory_) { 78 GraphMemProto *graph_proto = memory_proto_.add_graph_mem(); 79 if (graph_proto == nullptr) { 80 MS_LOG(ERROR) << "Add graph memory proto failed."; 81 return false; 82 } 83 graph_proto->set_graph_id(graph.second->GetGraphId()); 84 graph_proto->set_static_mem(graph.second->GetStaticMemSize()); 85 // node memory to PB 86 for (const auto &node : graph.second->GetNodeMemory()) { 87 NodeMemProto *node_mem = graph_proto->add_node_mems(); 88 if (node_mem == nullptr) { 89 MS_LOG(ERROR) << "Add node memory proto failed."; 90 return false; 91 } 92 node_mem->set_node_name(node.GetNodeName()); 93 node_mem->set_node_id(node.GetNodeId()); 94 for (const auto &id : node.GetInputTensorId()) { 95 node_mem->add_input_tensor_id(id); 96 } 97 for (const auto &id : node.GetOutputTensorId()) { 98 node_mem->add_output_tensor_id(id); 99 } 100 for (const auto &id : node.GetOutputTensorId()) { 101 node_mem->add_workspace_tensor_id(id); 102 } 103 } 104 // tensor memory to PB 105 for (const auto &node : graph.second->GetTensorMemory()) { 106 TensorMemProto *tensor_mem = graph_proto->add_tensor_mems(); 107 if (tensor_mem == nullptr) { 108 MS_LOG(ERROR) << "Add node memory proto failed."; 109 return false; 110 } 111 tensor_mem->set_tensor_id(node.GetTensorId()); 112 tensor_mem->set_size(node.GetAlignedSize()); 113 std::string type = node.GetType(); 114 tensor_mem->set_type(type); 115 tensor_mem->set_life_start(node.GetLifeStart()); 116 tensor_mem->set_life_end(node.GetLifeEnd()); 117 std::string life_long = node.GetLifeLong(); 118 tensor_mem->set_life_long(life_long); 119 } 120 } 121 MS_LOG(INFO) << "Memory profiling data to PB end."; 122 return true; 123 } 124 SaveMemoryProfiling()125void MemoryProfiling::SaveMemoryProfiling() { 126 auto context = MsContext::GetInstance(); 127 MS_EXCEPTION_IF_NULL(context); 128 std::string dir_path = GetOutputPath(); 129 auto device_id = common::GetEnv("RANK_ID"); 130 // If RANK_ID is not set, default value is 0 131 if (device_id.empty()) { 132 device_id = "0"; 133 } 134 135 if (!MemoryToPB()) { 136 return; 137 } 138 139 std::string file = dir_path + std::string("/memory_usage_") + std::string(device_id) + std::string(".pb"); 140 std::fstream handle(file, std::ios::out | std::ios::trunc | std::ios::binary); 141 if (!memory_proto_.SerializeToOstream(&handle)) { 142 MS_LOG(ERROR) << "Save memory profiling data to file failed"; 143 } 144 handle.close(); 145 146 MS_LOG(INFO) << "Start save memory profiling data to " << file << " end"; 147 return; 148 } 149 } // namespace ascend 150 } // namespace profiler 151 } // namespace mindspore 152