1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "transform/graph_ir/callbacks_ge.h"
17 #include "pybind11/pybind11.h"
18 #include "ir/param_info.h"
19 #include "include/transform/graph_ir/utils.h"
20 #include "pipeline/jit/ps/parse/data_converter.h"
21 #include "include/common/utils/python_adapter.h"
22 #include "utils/shape_utils.h"
23
24 namespace mindspore {
25 namespace callbacks {
// Python module name handed to python_adapter::CallPyFn by both save callbacks in this file.
26 const char PYTHON_MOD_CALLBACK_MODULE[] = "mindspore.train.callback._callback";
// Python function name invoked by CheckpointSaveCallback.
27 const char PYTHON_FUN_PROCESS_CHECKPOINT[] = "checkpoint_cb_for_save_op";
// Python function name invoked by SummarySaveCallback.
28 const char PYTHON_FUN_PROCESS_SUMMARY[] = "summary_cb_for_save_op";
// NOTE(review): kSummary and kCheckPoint are not referenced in the visible part of this file;
// presumably they are callback keys used elsewhere - confirm before removing.
29 const char kSummary[] = "Summary";
30 const char kCheckPoint[] = "Save";
// Single dimension used to build a (1,) shape (parameter without a tensor default, scalar summary).
31 const int ONE_SHAPE = 1;
32
33 using mindspore::transform::Status;
34
GetParameterShape(const FuncGraphPtr & graph,const std::string & param_name,const std::shared_ptr<ShapeVector> & shape)35 bool GetParameterShape(const FuncGraphPtr &graph, const std::string ¶m_name,
36 const std::shared_ptr<ShapeVector> &shape) {
37 if (graph == nullptr) {
38 MS_LOG(ERROR) << "Graph is null, can not get graph parameter";
39 return false;
40 }
41
42 auto parameter_nodes = graph->parameters();
43 for (auto &node : parameter_nodes) {
44 ParameterPtr param_node = std::static_pointer_cast<Parameter>(node);
45 if (param_node == nullptr) {
46 MS_LOG(ERROR) << "Parameter node is null, can not get graph parameter";
47 return false;
48 }
49 if (param_node->name() == param_name) {
50 TensorPtr tensor;
51 if (param_node->has_default()) {
52 tensor = std::dynamic_pointer_cast<tensor::Tensor>(param_node->default_param());
53 }
54 if (tensor == nullptr) {
55 shape->push_back(ONE_SHAPE);
56 } else {
57 *shape = tensor->shape();
58 }
59 return true;
60 }
61 }
62 MS_LOG(ERROR) << "Can not find parameter of name:" << param_name;
63 return false;
64 }
65
GetMeTensorTransformed(uint32_t graph_id,const std::string & parameter_name,const std::shared_ptr<ge::Tensor> & ge_tensor_ptr)66 static TensorPtr GetMeTensorTransformed(uint32_t graph_id, const std::string ¶meter_name,
67 const std::shared_ptr<ge::Tensor> &ge_tensor_ptr) {
68 FuncGraphPtr anf_graph = transform::GetAnfGraph(graph_id);
69 if (anf_graph == nullptr) {
70 MS_LOG(ERROR) << "Get anf graph failed during callback";
71 return nullptr;
72 }
73
74 std::shared_ptr<ShapeVector> parameter_shape_ptr = std::make_shared<ShapeVector>();
75 if (!GetParameterShape(anf_graph, parameter_name, parameter_shape_ptr)) {
76 MS_LOG(ERROR) << "Can not get parameter shape during callback";
77 return nullptr;
78 }
79
80 return transform::ConvertGeTensor(ge_tensor_ptr, *parameter_shape_ptr, true);
81 }
82
CheckpointSaveCallback(uint32_t graph_id,const std::map<std::string,ge::Tensor> & params_list)83 uint32_t CheckpointSaveCallback(uint32_t graph_id, const std::map<std::string, ge::Tensor> ¶ms_list) {
84 // Acquire GIL before calling Python code
85 py::gil_scoped_acquire acquire;
86
87 MS_LOG(DEBUG) << "Start the checkpoint save callback function in checkpoint save process.";
88 py::list parameter_list = py::list();
89 for (auto &item : params_list) {
90 std::string name = item.first;
91 std::shared_ptr<ge::Tensor> ge_tensor_ptr = std::make_shared<ge::Tensor>(item.second);
92 if (name.size() > 5 && name.compare(name.size() - 5, 5, "_temp") == 0) {
93 continue;
94 } else {
95 TensorPtr tensor_ptr = GetMeTensorTransformed(graph_id, name, ge_tensor_ptr);
96 if (tensor_ptr == nullptr) {
97 MS_LOG(EXCEPTION) << "Transform ge tensor to me tensor failed";
98 }
99 py::dict param_dict;
100 param_dict["name"] = name;
101 param_dict["data"] = tensor_ptr;
102 parameter_list.append(param_dict);
103 }
104 }
105 py::bool_ ret = python_adapter::CallPyFn(PYTHON_MOD_CALLBACK_MODULE, PYTHON_FUN_PROCESS_CHECKPOINT, parameter_list);
106 auto bool_ret = py::cast<bool>(ret);
107
108 uint32_t status = IntToUint(Status::SUCCESS);
109 if (!bool_ret) {
110 status = IntToUint(Status::FAILED);
111 MS_LOG(ERROR) << "Python checkpoint return false during callback";
112 }
113 return status;
114 }
115
GetMeTensorForSummary(const std::string & name,const std::shared_ptr<ge::Tensor> & ge_tensor_ptr)116 static TensorPtr GetMeTensorForSummary(const std::string &name, const std::shared_ptr<ge::Tensor> &ge_tensor_ptr) {
117 // confirm the type by name
118 // Format: xxx[:Scalar] xxx[:Image] xxx[:Tensor]
119 if (name.empty()) {
120 MS_LOG(EXCEPTION) << "The summary name is empty.";
121 }
122 auto bpos = name.rfind("[:");
123 if (bpos >= name.size()) {
124 MS_LOG(EXCEPTION) << "The summary name(" << name << ") is invalid.";
125 }
126 auto tname = name.substr(bpos);
127 if (tname == "[:Scalar]") {
128 MS_LOG(DEBUG) << "The summary(" << name << ") is Scalar";
129 // process the scalar type summary
130 // Because the ge tensor is dim = 4, so set the (1,1,1,1)-->(1,)
131 // We do the (1,) shape is scalar
132 auto shape = ShapeVector({ONE_SHAPE});
133 return transform::ConvertGeTensor(ge_tensor_ptr, shape);
134 }
135 if (tname == "[:Tensor]" || tname == "[:Histogram]") {
136 MS_LOG(DEBUG) << "The summary(" << name << ") is Tensor";
137 // process the tensor summary
138 // Now we can't get the real shape, so we keep same shape with GE
139 return transform::ConvertGeTensor(ge_tensor_ptr);
140 }
141 if (tname == "[:Image]") {
142 MS_LOG(DEBUG) << "The summary(" << name << ") is Image";
143 // process the Image summary
144 // Image dim = 4, is same with ge, so we keep same shape with GE
145 return transform::ConvertGeTensor(ge_tensor_ptr);
146 }
147
148 MS_LOG(EXCEPTION) << "The summary name(" << name << ") is invalid.";
149 }
150
151 // Cache the summary callback data
152 // Output Format: [{"name": tag_name, "data": tensor}, {"name": tag_name, "data": tensor},...]
SummarySaveCallback(uint32_t graph_id,const std::map<std::string,ge::Tensor> & params_list)153 uint32_t SummarySaveCallback(uint32_t graph_id, const std::map<std::string, ge::Tensor> ¶ms_list) {
154 // Acquire GIL before calling Python code
155 py::gil_scoped_acquire acquire;
156
157 MS_LOG(DEBUG) << "Start the summary save callback function for graph " << graph_id << ".";
158 py::list summary_list = py::list();
159 MS_LOG(DEBUG) << "Param list size = " << params_list.size();
160 for (auto &item : params_list) {
161 std::string tag_name = item.first;
162 std::shared_ptr<ge::Tensor> ge_tensor_ptr = std::make_shared<ge::Tensor>(item.second);
163 TensorPtr tensor_ptr = GetMeTensorForSummary(tag_name, ge_tensor_ptr);
164 if (tensor_ptr == nullptr) {
165 MS_LOG(EXCEPTION) << "ConvertGeTensor return tensor is null";
166 }
167 py::dict summary_value_dict;
168 summary_value_dict["name"] = tag_name;
169 summary_value_dict["data"] = tensor_ptr;
170 summary_list.append(summary_value_dict);
171 }
172
173 py::bool_ ret = python_adapter::CallPyFn(PYTHON_MOD_CALLBACK_MODULE, PYTHON_FUN_PROCESS_SUMMARY, summary_list);
174 auto bool_ret = py::cast<bool>(ret);
175 if (!bool_ret) {
176 MS_LOG(ERROR) << "Python checkpoint return false during callback";
177 return IntToUint(Status::FAILED);
178 }
179 MS_LOG(DEBUG) << "End the summary save callback function.";
180 return IntToUint(Status::SUCCESS);
181 }
182 } // namespace callbacks
183 } // namespace mindspore
184