• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/callbacks_ge.h"
18 #include "pybind11/pybind11.h"
19 #include "ir/param_info.h"
20 #include "transform/graph_ir/df_graph_manager.h"
21 #include "transform/graph_ir/util.h"
22 #include "pipeline/jit/parse/data_converter.h"
23 #include "pipeline/jit/parse/python_adapter.h"
24 #include "utils/visible.h"
25 #include "utils/shape_utils.h"
26 
27 namespace mindspore {
28 namespace callbacks {
// Python module that hosts the save-op callback entry points.
constexpr char PYTHON_MOD_CALLBACK_MODULE[] = "mindspore.train.callback._callback";

// Python functions called with the collected checkpoint / summary data.
constexpr char PYTHON_FUN_PROCESS_CHECKPOINT[] = "checkpoint_cb_for_save_op";
constexpr char PYTHON_FUN_PROCESS_SUMMARY[] = "summary_cb_for_save_op";

// Callback kind names.
constexpr char kSummary[] = "Summary";
constexpr char kCheckPoint[] = "Save";

// Extent used for the single dimension of a (1,) shape.
constexpr int ONE_SHAPE = 1;
35 
36 using mindspore::transform::Status;
37 using mindspore::transform::TransformUtil;
38 
GetParameterShape(const FuncGraphPtr & graph,const std::string & param_name,const std::shared_ptr<ShapeVector> & shape)39 bool GetParameterShape(const FuncGraphPtr &graph, const std::string &param_name,
40                        const std::shared_ptr<ShapeVector> &shape) {
41   if (graph == nullptr) {
42     MS_LOG(ERROR) << "Graph is null, can not get graph parameter";
43     return false;
44   }
45 
46   auto parameter_nodes = graph->parameters();
47   for (auto &node : parameter_nodes) {
48     ParameterPtr param_node = std::static_pointer_cast<Parameter>(node);
49     if (param_node == nullptr) {
50       MS_LOG(ERROR) << "Parameter node is null, can not get graph parameter";
51       return false;
52     }
53     if (param_node->name() == param_name) {
54       TensorPtr tensor;
55       if (param_node->has_default()) {
56         tensor = std::dynamic_pointer_cast<tensor::Tensor>(param_node->default_param());
57       }
58       if (tensor == nullptr) {
59         shape->push_back(ONE_SHAPE);
60       } else {
61         *shape = tensor->shape();
62       }
63       return true;
64     }
65   }
66   MS_LOG(ERROR) << "Can not find parameter of name:" << param_name;
67   return false;
68 }
69 
GetMeTensorTransformed(uint32_t graph_id,const std::string & parameter_name,const std::shared_ptr<ge::Tensor> & ge_tensor_ptr)70 static TensorPtr GetMeTensorTransformed(uint32_t graph_id, const std::string &parameter_name,
71                                         const std::shared_ptr<ge::Tensor> &ge_tensor_ptr) {
72   FuncGraphPtr anf_graph = transform::DfGraphManager::GetInstance().GetAnfGraph(graph_id);
73   if (anf_graph == nullptr) {
74     MS_LOG(ERROR) << "Get anf graph failed during callback";
75     return nullptr;
76   }
77 
78   std::shared_ptr<ShapeVector> parameter_shape_ptr = std::make_shared<ShapeVector>();
79   if (!GetParameterShape(anf_graph, parameter_name, parameter_shape_ptr)) {
80     MS_LOG(ERROR) << "Can not get parameter shape during callback";
81     return nullptr;
82   }
83 
84   return TransformUtil::ConvertGeTensor(ge_tensor_ptr, *parameter_shape_ptr);
85 }
86 
CheckpointSaveCallback(uint32_t graph_id,const std::map<std::string,ge::Tensor> & params_list)87 uint32_t CheckpointSaveCallback(uint32_t graph_id, const std::map<std::string, ge::Tensor> &params_list) {
88   // Acquire GIL before calling Python code
89   py::gil_scoped_acquire acquire;
90 
91   MS_LOG(DEBUG) << "Start the checkpoint save callback function in checkpoint save process.";
92   py::list parameter_list = py::list();
93   for (auto &item : params_list) {
94     std::string name = item.first;
95     std::shared_ptr<ge::Tensor> ge_tensor_ptr = std::make_shared<ge::Tensor>(item.second);
96     if (name.size() > 5 && name.compare(name.size() - 5, 5, "_temp") == 0) {
97       continue;
98     } else {
99       TensorPtr tensor_ptr = GetMeTensorTransformed(graph_id, name, ge_tensor_ptr);
100       if (tensor_ptr == nullptr) {
101         MS_LOG(EXCEPTION) << "Transform ge tensor to me tensor failed";
102       }
103       py::dict param_dict;
104       param_dict["name"] = name;
105       param_dict["data"] = tensor_ptr;
106       parameter_list.append(param_dict);
107     }
108   }
109   py::bool_ ret =
110     parse::python_adapter::CallPyFn(PYTHON_MOD_CALLBACK_MODULE, PYTHON_FUN_PROCESS_CHECKPOINT, parameter_list);
111   auto bool_ret = py::cast<bool>(ret);
112 
113   uint32_t status = Status::SUCCESS;
114   if (!bool_ret) {
115     status = Status::FAILED;
116     MS_LOG(ERROR) << "Python checkpoint return false during callback";
117   }
118   return status;
119 }
120 
GetMeTensorForSummary(const std::string & name,const std::shared_ptr<ge::Tensor> & ge_tensor_ptr)121 static TensorPtr GetMeTensorForSummary(const std::string &name, const std::shared_ptr<ge::Tensor> &ge_tensor_ptr) {
122   // confirm the type by name
123   // Format: xxx[:Scalar] xxx[:Image] xxx[:Tensor]
124   if (name.empty()) {
125     MS_LOG(EXCEPTION) << "The summary name is empty.";
126   }
127   auto bpos = name.rfind("[:");
128   if (bpos >= name.size()) {
129     MS_LOG(EXCEPTION) << "The summary name(" << name << ") is invalid.";
130   }
131   auto tname = name.substr(bpos);
132   if (tname == "[:Scalar]") {
133     MS_LOG(DEBUG) << "The summary(" << name << ") is Scalar";
134     // process the scalar type summary
135     // Because the ge tensor is dim = 4, so set the (1,1,1,1)-->(1,)
136     // We do the (1,) shape is scalar
137     auto shape = ShapeVector({ONE_SHAPE});
138     return TransformUtil::ConvertGeTensor(ge_tensor_ptr, shape);
139   }
140   if (tname == "[:Tensor]" || tname == "[:Histogram]") {
141     MS_LOG(DEBUG) << "The summary(" << name << ") is Tensor";
142     // process the tensor summary
143     // Now we can't get the real shape, so we keep same shape with GE
144     return TransformUtil::ConvertGeTensor(ge_tensor_ptr);
145   }
146   if (tname == "[:Image]") {
147     MS_LOG(DEBUG) << "The summary(" << name << ") is Image";
148     // process the Image summary
149     // Image dim = 4, is same with ge, so we keep same shape with GE
150     return TransformUtil::ConvertGeTensor(ge_tensor_ptr);
151   }
152 
153   MS_LOG(EXCEPTION) << "The summary name(" << name << ") is invalid.";
154 }
155 
156 // Cache the summary callback data
157 // Output Format: [{"name": tag_name, "data": tensor}, {"name": tag_name, "data": tensor},...]
SummarySaveCallback(uint32_t graph_id,const std::map<std::string,ge::Tensor> & params_list)158 uint32_t MS_EXPORT SummarySaveCallback(uint32_t graph_id, const std::map<std::string, ge::Tensor> &params_list) {
159   // Acquire GIL before calling Python code
160   py::gil_scoped_acquire acquire;
161 
162   MS_LOG(DEBUG) << "Start the summary save callback function for graph " << graph_id << ".";
163   py::list summary_list = py::list();
164   MS_LOG(DEBUG) << "Param list size = " << params_list.size();
165   for (auto &item : params_list) {
166     std::string tag_name = item.first;
167     std::shared_ptr<ge::Tensor> ge_tensor_ptr = std::make_shared<ge::Tensor>(item.second);
168     TensorPtr tensor_ptr = GetMeTensorForSummary(tag_name, ge_tensor_ptr);
169     if (tensor_ptr == nullptr) {
170       MS_LOG(EXCEPTION) << "ConvertGeTensor return tensor is null";
171     }
172     py::dict summary_value_dict;
173     summary_value_dict["name"] = tag_name;
174     summary_value_dict["data"] = tensor_ptr;
175     summary_list.append(summary_value_dict);
176   }
177 
178   py::bool_ ret = parse::python_adapter::CallPyFn(PYTHON_MOD_CALLBACK_MODULE, PYTHON_FUN_PROCESS_SUMMARY, summary_list);
179   auto bool_ret = py::cast<bool>(ret);
180   if (!bool_ret) {
181     MS_LOG(ERROR) << "Python checkpoint return false during callback";
182     return Status::FAILED;
183   }
184   MS_LOG(DEBUG) << "End the summary save callback function.";
185   return Status::SUCCESS;
186 }
187 }  // namespace callbacks
188 }  // namespace mindspore
189