1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef MINDSPORE_LITE_TOOLS_BENCHMARK_TRAIN_NET_TRAIN_H_
18 #define MINDSPORE_LITE_TOOLS_BENCHMARK_TRAIN_NET_TRAIN_H_
19
20 #include <getopt.h>
21 #include <signal.h>
22 #include <unordered_map>
23 #include <fstream>
24 #include <iostream>
25 #include <map>
26 #include <cmath>
27 #include <string>
28 #include <vector>
29 #include <memory>
30 #include <cfloat>
31 #include <utility>
32 #include <algorithm>
33
34 #include "tools/common/flag_parser.h"
35 #include "src/common/file_utils.h"
36 #include "src/common/utils.h"
37 #include "include/lite_session.h"
38
39 namespace mindspore::lite {
40 enum MS_API DataType { kImage = 0, kBinary = 1 };
41
// Tolerances used by NetTrain::CompareData: an element counts as an error when
// |output - reference| > absoluteTolerance + relativeTolerance * |reference|.
constexpr float relativeTolerance = 1e-5f;
constexpr float absoluteTolerance = 1e-8f;
44
// Adds up `size` elements of type T stored at `data`, accumulating in float.
// Each element is converted to float before it is added, in index order.
// Returns 0 when `size` is zero or negative (no element is read in that case).
template <typename T>
float TensorSum(void *data, int size) {
  const T *cursor = static_cast<T *>(data);
  float total = 0.f;
  for (int remaining = size; remaining > 0; --remaining, ++cursor) {
    total += static_cast<float>(*cursor);
  }
  return total;
}
54
55 class MS_API NetTrainFlags : public virtual FlagParser {
56 public:
NetTrainFlags()57 NetTrainFlags() {
58 // common
59 AddFlag(&NetTrainFlags::model_file_, "modelFile", "Input model file", "");
60 AddFlag(&NetTrainFlags::bb_model_file_, "bbModelFile", "Backboine model for transfer session", "");
61 AddFlag(&NetTrainFlags::in_data_file_, "inDataFile", "Input data file, if not set, use random input", "");
62 // MarkPerformance
63 AddFlag(&NetTrainFlags::warm_up_loop_count_, "warmUpLoopCount", "Run warm up loop", 0);
64 AddFlag(&NetTrainFlags::time_profiling_, "timeProfiling", "Run time profiling", false);
65 AddFlag(&NetTrainFlags::epochs_, "epochs", "Number of training epochs to run", 1);
66 AddFlag(&NetTrainFlags::num_threads_, "numThreads", "Run threads number", 1);
67 // MarkAccuracy
68 AddFlag(&NetTrainFlags::data_file_, "expectedDataFile", "Expected results data file path", "");
69 AddFlag(&NetTrainFlags::export_file_, "exportFile", "MS File to export trained model into", "");
70 AddFlag(&NetTrainFlags::accuracy_threshold_, "accuracyThreshold", "Threshold of accuracy", 0.5);
71 AddFlag(&NetTrainFlags::layer_checksum_, "layerCheckSum", "layer output checksum print (debug)", false);
72 AddFlag(&NetTrainFlags::enable_fp16_, "enableFp16", "Enable float16", false);
73 AddFlag(&NetTrainFlags::loss_name_, "lossName", "loss layer name", "");
74 AddFlag(&NetTrainFlags::inference_file_, "inferenceFile", "MS file to export inference model", "");
75 AddFlag(&NetTrainFlags::virtual_batch_, "virtualBatch", "use virtual batch", false);
76 AddFlag(&NetTrainFlags::resize_dims_in_, "inputShapes",
77 "Shape of input data, the format should be NHWC. e.g. 1,32,32,32:1,1,32,32,1", "");
78 AddFlag(&NetTrainFlags::is_raw_mix_precision_, "isRawMixPrecision",
79 "If model is mix precision export from MindSpore,please set true", false);
80 }
81
82 ~NetTrainFlags() override = default;
83 void InitResizeDimsList();
84
85 public:
86 // common
87 std::string model_file_;
88 std::string in_data_file_;
89 std::string bb_model_file_;
90 std::vector<std::string> input_data_list_;
91 DataType in_data_type_;
92 std::string in_data_type_in_ = "bin";
93 int cpu_bind_mode_ = 1;
94 bool enable_fp16_ = false;
95 bool virtual_batch_ = false;
96 // MarkPerformance
97 int num_threads_ = 1;
98 int warm_up_loop_count_ = 0;
99 bool time_profiling_;
100 int epochs_ = 1;
101 // MarkAccuracy
102 std::string data_file_;
103 std::string data_type_ = "FLOAT";
104 float accuracy_threshold_;
105 // Resize
106 std::string export_file_ = "";
107 std::string resize_dims_in_ = "";
108 bool layer_checksum_ = false;
109 std::vector<std::vector<int>> resize_dims_;
110 std::string loss_name_ = "";
111 std::string inference_file_ = "";
112 bool is_raw_mix_precision_ = false;
113 };
114
115 class MS_API NetTrain {
116 public:
NetTrain(NetTrainFlags * flags)117 explicit NetTrain(NetTrainFlags *flags) : flags_(flags) {}
118 virtual ~NetTrain() = default;
119
120 int Init();
121 int RunNetTrain();
122
123 private:
124 // call GenerateInputData or ReadInputFile to init inputTensors
125 int LoadInput(Vector<tensor::MSTensor *> *ms_inputs);
126 void CheckSum(mindspore::tensor::MSTensor *tensor, std::string node_type, int id, std::string in_out);
127 // call GenerateRandomData to fill inputTensors
128 int GenerateInputData(std::vector<mindspore::tensor::MSTensor *> *ms_inputs);
129
130 int GenerateRandomData(size_t size, void *data);
131
132 int ReadInputFile(std::vector<mindspore::tensor::MSTensor *> *ms_inputs);
133 int CreateAndRunNetwork(const std::string &filename, const std::string &bb_filename, int train_session, int epochs,
134 bool check_accuracy = true);
135
136 std::unique_ptr<session::LiteSession> CreateAndRunNetworkForInference(const std::string &filename,
137 const Context &context);
138
139 std::unique_ptr<session::LiteSession> CreateAndRunNetworkForTrain(const std::string &filename,
140 const std::string &bb_filename,
141 const Context &context, const TrainCfg &train_cfg,
142 int epochs);
143
144 int InitCallbackParameter();
145
146 int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);
147
148 template <typename T>
PrintInputData(tensor::MSTensor * input)149 void PrintInputData(tensor::MSTensor *input) {
150 MS_ASSERT(input != nullptr);
151 static int i = 0;
152 auto inData = reinterpret_cast<T *>(input->MutableData());
153 size_t tensorSize = input->ElementsNum();
154 size_t len = (tensorSize < 20) ? tensorSize : 20;
155 std::cout << "InData" << i++ << ": ";
156 for (size_t j = 0; j < len; j++) {
157 std::cout << inData[j] << " ";
158 }
159 std::cout << std::endl;
160 }
161
162 // tensorData need to be converter first
163 template <typename T>
CompareData(const float * refOutput,int size,T * msTensorData)164 float CompareData(const float *refOutput, int size, T *msTensorData) {
165 size_t errorCount = 0;
166 float meanError = 0;
167 std::cout << "Data of model output: ";
168 for (int j = 0; j < std::min(50, size); j++) {
169 std::cout << static_cast<float>(msTensorData[j]) << " ";
170 }
171 std::cout << std::endl;
172 std::cout << "Data of Ref output : ";
173 for (int j = 0; j < std::min(50, size); j++) {
174 std::cout << refOutput[j] << " ";
175 }
176 std::cout << std::endl;
177 for (int j = 0; j < size; j++) {
178 if (std::isnan(msTensorData[j]) || std::isinf(msTensorData[j])) {
179 std::cerr << "Output tensor has nan or inf data, compare fail" << std::endl;
180 MS_LOG(ERROR) << "Output tensor has nan or inf data, compare fail";
181 return RET_ERROR;
182 }
183
184 auto tolerance = absoluteTolerance + relativeTolerance * fabs(refOutput[j]);
185 auto absoluteError = std::fabs(static_cast<float>(msTensorData[j]) - refOutput[j]);
186 if (absoluteError > tolerance) {
187 if (fabs(refOutput[j]) == 0) {
188 if (absoluteError > 1e-5) {
189 meanError += absoluteError;
190 errorCount++;
191 } else {
192 continue;
193 }
194 } else {
195 // just assume that atol = rtol
196 meanError += absoluteError / (fabs(refOutput[j]) + FLT_MIN);
197 errorCount++;
198 }
199 }
200 }
201 std::cout << std::endl;
202 if (meanError > 0.0f) {
203 meanError /= errorCount;
204 }
205
206 if (meanError <= 0.0000001) {
207 std::cout << "Mean bias of tensor: 0%" << std::endl;
208 } else {
209 std::cout << "Mean bias of tensor: " << meanError * 100 << "%" << std::endl;
210 }
211 return meanError;
212 }
213
214 int MarkPerformance(const std::unique_ptr<session::LiteSession> &session);
215 int MarkAccuracy(const std::unique_ptr<session::LiteSession> &session, bool enforce_accuracy = true);
216 int CompareOutput(const session::LiteSession &lite_session);
217 int SaveModels(const std::unique_ptr<session::LiteSession> &session);
218 int CheckExecutionOfSavedModels();
TensorNan(float * data,int size)219 void TensorNan(float *data, int size) {
220 for (int i = 0; i < size; i++) {
221 if (std::isnan(data[i])) {
222 std::cout << "nan value of index=" << i << std::endl;
223 break;
224 }
225 }
226 }
227 NetTrainFlags *flags_;
228
229 // callback parameters
230 uint64_t op_begin_ = 0;
231 int op_call_times_total_ = 0;
232 float op_cost_total_ = 0.0f;
233 std::map<std::string, std::pair<int, float>> op_times_by_type_;
234 std::map<std::string, std::pair<int, float>> op_times_by_name_;
235
236 mindspore::KernelCallBack before_call_back_;
237 mindspore::KernelCallBack after_call_back_;
238 };
239
240 int MS_API RunNetTrain(int argc, const char **argv);
241 } // namespace mindspore::lite
242 #endif // MINDSPORE_LITE_TOOLS_BENCHMARK_TRAIN_NET_TRAIN_H_
243