/**
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tools/converter/quantizer/parameter_tunner.h"
#include <set>
#include <functional>
#include <map>
#include <memory>
#include <vector>
#include <algorithm>
#include <cmath>
#include "tools/converter/preprocess/cv_calib_data.h"
#include "tools/converter/preprocess/image_preprocess.h"
#ifdef MSLITE_DEPS_OPENCV
#include "tools/converter/preprocess/opencv_utils.h"
#endif
#include "tools/converter/quantizer/quantize_util.h"
#include "tools/converter/quantizer/weight_quantizer.h"
#include "tools/converter/export_model.h"
#include "tools/common/tensor_util.h"
#include "tools/converter/parser/parser_utils.h"

namespace mindspore::lite::quant {
namespace {
static const int kOneChannel = 1;
static const int kThreeChannels = 3;
static const int kSixChannels = 6;
static const int kDefaultRangeStart = 1;
static const int kDefaultRangeEnd = 30000;
static const int kDefaultExtendFactor = 4;
static const int kMaxStepCoarseSearch = 173;
static const int kMaxStepFineSearch = 20;
static const int kMaxFineSearchIterations = 3;
static const int kFinestGranularity = 2;
static const float kCompressToInt8Ratio = 4.0f;
static const int kTwo = 2;
static const float kScaleFactor = (0.01 * 0.01 * 0.01 * 24.0);
}  // namespace
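// Returns a heuristic minimal quantization scale derived from the largest weight tensor that
// was quantized; its inverse is used below to decide whether the default scale search range
// must be extended.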
float GetMinScale(const std::set<tensor::TensorPtr> &weight_quantized_tensors) {
  size_t max_tensor_size = 1;
  for (const auto &tensor : weight_quantized_tensors) {
    max_tensor_size = std::max(max_tensor_size, tensor->DataSize());
  }
  return (max_tensor_size > 0) ? std::sqrt(kScaleFactor / max_tensor_size) : kScaleFactor;
}

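// Resizes all model inputs so that their batch dimension (dim 0) equals `batch`, allowing all
// calibration images to be fed in a single Predict call.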
static Status ExtendBatchSize(const std::shared_ptr<mindspore::Model> &model, std::vector<MSTensor> *inputs,
                              int batch) {
  std::vector<std::vector<int64_t>> dims(inputs->size());
  int i = 0;
  for (auto input : *inputs) {
    dims.at(i) = input.Shape();
    dims.at(i).at(0) = batch;
    input.SetShape(dims.at(i));
    i++;
  }
  return model->Resize(*inputs, dims);
}

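// Copies the input data of the float origin model into the weight-quantized model's inputs and
// runs the quantized model, so that both models are evaluated on identical data.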
int ParameterOptimizer::CopyDataAndRun(const std::shared_ptr<mindspore::Model> &origin_model,
                                       const std::shared_ptr<mindspore::Model> &quant_model) {
  auto weight_quant_inputs = quant_model->GetInputs();
  for (auto input : weight_quant_inputs) {
    auto origin_tensor = origin_model->GetInputByTensorName(input.Name());
    auto weight_quant_tensor_data = input.MutableData();
    if (memcpy_s(weight_quant_tensor_data, input.DataSize(), origin_tensor.MutableData(), origin_tensor.DataSize()) !=
        EOK) {
      MS_LOG(ERROR) << "memcpy data failed.";
      return RET_ERROR;
    }
  }
  auto weight_quant_outputs = quant_model->GetOutputs();
  auto model_status = quant_model->Predict(weight_quant_inputs, &weight_quant_outputs);
  if (model_status != kSuccess) {
87 MS_LOG(ERROR) << "Run origin session failed.";
    return RET_ERROR;
  }

  return RET_OK;
}

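// Sweeps the scale range described by s_param. For each candidate denominator `scale`, the graph
// is cloned, weight-quantized with init_scale = 1/scale, rebuilt, and run on the origin model's
// inputs. The sweep stops at the first scale whose output cosine similarity to the float model
// reaches the 0.995 threshold, reporting it through ret_scale together with its compression ratio.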
int ParameterOptimizer::WeightQuantModelInference(const FuncGraphPtr &func_graph,
                                                  const std::shared_ptr<ConverterPara> &param,
                                                  const std::shared_ptr<mindspore::Model> &origin_model,
                                                  size_t origin_model_size, SearchParams *s_param, int *ret_scale,
                                                  float *best_compress_ratio, bool *found_valid_scale) {
  CHECK_NULL_RETURN(param);
  CHECK_NULL_RETURN(origin_model);
  CHECK_NULL_RETURN(ret_scale);
  CHECK_NULL_RETURN(best_compress_ratio);
  CHECK_NULL_RETURN(found_valid_scale);
  CHECK_NULL_RETURN(s_param);
  const float threshold = 0.995f;
  *best_compress_ratio = 0.0f;
  *found_valid_scale = false;
  for (int scale = s_param->range_start; scale <= s_param->range_end; scale += s_param->step) {
    param->commonQuantParam.quant_type = quant::QUANT_WEIGHT;
    FuncGraphPtr func_graph_bak;
    auto ret = CloneFuncGraph(func_graph, param, &func_graph_bak);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Clone FuncGraph failed.";
      return ret;
    }

    auto quantizer = std::make_unique<quant::WeightQuantizer>(param, 1.0f / scale);
    CHECK_NULL_RETURN(quantizer);
    auto status = quantizer->DoQuantize(func_graph_bak);
    if (status != RET_OK) {
      MS_LOG(WARNING) << "DoQuantization failed " << status;
      continue;
    }

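    // On the first candidate, widen the search range if the minimal usable scale of the
    // quantized tensors lies beyond range_end (the step is rescaled to keep the step count).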
    if (scale == 1) {
      float inv_min_scale = GetMinScale(quantizer->GetWeightQuantizedTensors());
      if (inv_min_scale != 0) {
        if ((1.0 / inv_min_scale) > s_param->range_end) {
          // extend scale end
          int num_of_steps = (s_param->range_end - s_param->range_start) / s_param->step;
          s_param->range_end = static_cast<int>(1.0 / inv_min_scale);
          s_param->step = s_param->range_end / num_of_steps;
        }
      }
      std::cout << "=== Basic search in range [1," << s_param->range_end << "] with step " << s_param->step
                << " ===\n";
    }

    MS_LOG(INFO) << "create quant session";
    size_t weight_quant_size = 0;
    auto weight_quant_model = std::make_shared<mindspore::Model>();
    CHECK_NULL_RETURN(weight_quant_model);
    auto build_status = BuildModelByFuncGraph(weight_quant_model, func_graph_bak, param, &weight_quant_size);
    if (build_status != kSuccess) {
      MS_LOG(WARNING) << "build model failed!";
      continue;
    }
    auto weight_quant_inputs = weight_quant_model->GetInputs();
    if ((param->dataPreProcessParam.calibrate_size == 0) && (param->mixedBitWeightQuantParam.use_cv_data)) {
      if (ExtendBatchSize(weight_quant_model, &weight_quant_inputs, kNumOfCalibrationImages) != kSuccess) {
        MS_LOG(ERROR) << "Resize session for CV calibration failed!";
        return RET_ERROR;
      }
    }
    auto model_status = CopyDataAndRun(origin_model, weight_quant_model);
    if (model_status != RET_OK) {
      MS_LOG(ERROR) << "Copy input data and run quant model failed.";
      return RET_ERROR;
    }
    auto cos_sim = CompareDataByCosineDistance<float>(origin_model, weight_quant_model);
    MS_CHECK_TRUE_MSG(weight_quant_size > 0, RET_ERROR, "weight quant size must be larger than 0");
    const auto compress_ratio = 1.0 * origin_model_size / weight_quant_size;
    std::cout << " scale:" << scale << " cos_sim:" << cos_sim << " compression ratio:" << compress_ratio << std::endl;
    if (cos_sim >= threshold) {
      if (compress_ratio > *best_compress_ratio) {
        *best_compress_ratio = compress_ratio;
        *found_valid_scale = true;
        *ret_scale = scale;
        return RET_OK;
      }
    }
  }
  *found_valid_scale = false;
  MS_LOG(DEBUG) << "Couldn't reach cosine similarity constraint";
  return RET_OK;
}

#ifdef MSLITE_DEPS_OPENCV
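// Decodes one embedded calibration image, resizes it to the tensor's height/width, normalizes it
// with mean/std 127.5 (roughly into [-1, 1]), and adapts it to the channel count the model expects
// (grayscale for 1 channel, the same image stacked twice for 6 channels).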
static int PrepareSingleImage(const uint8_t *buf, int len, const std::vector<int64_t> &shape, uint8_t *out_buf,
                              size_t *out_len) {
  cv::Mat mat;
  const int HEIGHT_INDEX = 1;
  const int WIDTH_INDEX = 2;
  const std::vector<double> mean = {127.5, 127.5, 127.5};
  const std::vector<double> standard_deviation = {127.5, 127.5, 127.5};
  auto ret = preprocess::DecodeBuffer(buf, len, &mat);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PrepareSingleImage error in decode: " << ret;
    return ret;
  }
  ret = preprocess::Resize(&mat, shape.at(WIDTH_INDEX), shape.at(HEIGHT_INDEX), cv::INTER_LINEAR);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PrepareSingleImage error in Resize: " << ret;
    return ret;
  }
  ret = preprocess::Normalize(&mat, mean, standard_deviation);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PrepareSingleImage error in Normalize: " << ret;
    return ret;
  }
  if (shape.at(kNHWC_C) == kOneChannel) {
    ret = preprocess::ConvertImageFormat(&mat, cv::COLOR_BGR2GRAY);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "PrepareSingleImage error in Gray Scaling: " << ret;
      return ret;
    }
  }
  if (shape.at(kNHWC_C) == kSixChannels) {  // in case of 2 stacked images
    std::vector<cv::Mat> channels(kThreeChannels);
    cv::split(mat, channels);
    std::vector<cv::Mat> mat_6_channels;
    for (int i = 0; i < kThreeChannels; i++) {
      mat_6_channels.push_back(channels[i]);
    }
    for (int i = 0; i < kThreeChannels; i++) {
      mat_6_channels.push_back(channels[i]);
    }
    cv::merge(mat_6_channels, mat);
  }
  uint8_t *data = nullptr;
  size_t size = 0;
  ret = preprocess::GetMatData(mat, reinterpret_cast<void **>(&data), &size);
  if (data == nullptr || size == 0) {
    delete[] data;  // no-op when data is nullptr
    MS_LOG(ERROR) << "GetMatData data is nullptr or size == 0";
    return RET_ERROR;
  }
  if (ret != RET_OK) {
    delete[] data;
    MS_LOG(ERROR) << "Get mat data failed.";
    return ret;
  }

  if (size > *out_len) {
    delete[] data;
    MS_LOG(ERROR) << "Buffer size mismatch " << size << " vs " << *out_len;
    return RET_ERROR;
  }
  std::copy(data, data + size, out_buf);
  delete[] data;
  *out_len = size;
  return RET_OK;
}

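// Fills a model input tensor with the embedded COCO calibration images, one image per batch entry.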
static int GenerateCvData(mindspore::MSTensor *tensor) {
  MS_ASSERT(tensor != nullptr);
  auto input_data = tensor->MutableData();
  if (input_data == nullptr) {
    MS_LOG(ERROR) << "MallocData for tensor failed";
    return RET_ERROR;
  }

  const int num_of_images = kNumOfCalibrationImages;
  const uint8_t *ims[num_of_images] = {COCO_train2014_0581821, COCO_train2014_0581882, COCO_train2014_0581909};
  int im_sizes[num_of_images] = {sizeof(COCO_train2014_0581821), sizeof(COCO_train2014_0581882),
                                 sizeof(COCO_train2014_0581909)};
  uint8_t *t_data = reinterpret_cast<uint8_t *>(tensor->MutableData());
  size_t t_size = tensor->DataSize();
  size_t loc = 0;

  for (int i = 0; i < num_of_images; i++) {
    size_t o_size = t_size - loc;
    auto ret = PrepareSingleImage(ims[i], im_sizes[i], tensor->Shape(), t_data + loc, &o_size);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Prepare single image failed";
      return ret;
    }
    loc += o_size;
  }
  return RET_OK;
}
#endif

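// Builds and runs the float (non-quantized) model once to produce the reference outputs that each
// weight-quantized candidate is compared against. Inputs are taken from user calibration data when
// provided, otherwise from the embedded CV images or random data.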
int ParameterOptimizer::OriginModelInference(const FuncGraphPtr &func_graph,
                                             const std::shared_ptr<ConverterPara> &param,
                                             const std::shared_ptr<mindspore::Model> &origin_model,
                                             size_t *origin_model_size) {
  CHECK_NULL_RETURN(param);
  CHECK_NULL_RETURN(origin_model);
  CHECK_NULL_RETURN(origin_model_size);
  FuncGraphPtr func_graph_bak;
  auto ret = CloneFuncGraph(func_graph, param, &func_graph_bak);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Clone FuncGraph failed.";
    return RET_ERROR;
  }
  param->commonQuantParam.quant_type = quant::QUANT_NONE;
  *origin_model_size = 0;
  auto status = BuildModelByFuncGraph(origin_model, func_graph_bak, param, origin_model_size);
  if (status != kSuccess) {
    MS_LOG(ERROR) << "build model failed!";
    return RET_ERROR;
  }
  auto origin_inputs = origin_model->GetInputs();
  if ((param->dataPreProcessParam.calibrate_size == 0) && (param->mixedBitWeightQuantParam.use_cv_data)) {
    if (ExtendBatchSize(origin_model, &origin_inputs, kNumOfCalibrationImages) != kSuccess) {
      MS_LOG(ERROR) << "Resize session for CV calibration failed!";
      return RET_ERROR;
    }
  }

  for (auto input : origin_inputs) {
    if (param->dataPreProcessParam.calibrate_size > 0) {
      ret = preprocess::PreProcess(param->dataPreProcessParam, input.Name(), 0, &input);
    } else {
#ifdef MSLITE_DEPS_OPENCV
      if (param->mixedBitWeightQuantParam.use_cv_data && (input.Shape().size() == DIMENSION_4D) &&
          (input.Shape().at(0) == kNumOfCalibrationImages) &&
          ((input.Shape().at(kNHWC_C) == kOneChannel) || (input.Shape().at(kNHWC_C) == kThreeChannels) ||
           (input.Shape().at(kNHWC_C) == kSixChannels)) &&
          (input.DataType() == DataType::kNumberTypeFloat32)) {
        ret = GenerateCvData(&input);
      } else {
        ret = GenerateRandomData(&input);
      }
#else
      ret = GenerateRandomData(&input);
#endif
    }
    if (ret != RET_OK) {
      MS_LOG(ERROR) << input.Name() << ": generate input data failed.";
      return ret;
    }
  }
  auto origin_outputs = origin_model->GetOutputs();
  auto model_status = origin_model->Predict(origin_inputs, &origin_outputs);
  if (model_status != kSuccess) {
    MS_LOG(ERROR) << "Run origin predict failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

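// Coarse-to-fine grid search over the scale denominator: a coarse sweep over
// [kDefaultRangeStart, kDefaultRangeEnd] (widened by kDefaultExtendFactor when nothing passes),
// followed by up to kMaxFineSearchIterations narrower sweeps around the best hit. The winning
// denominator is returned as *init_scale = 1 / ord_param.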
int ParameterOptimizer::GridSearchForScale(const FuncGraphPtr &func_graph, const std::shared_ptr<ConverterPara> &param,
                                           double *init_scale) {
  CHECK_NULL_RETURN(param);
  CHECK_NULL_RETURN(init_scale);

  auto origin_model = std::make_shared<mindspore::Model>();
  size_t origin_model_size;
  int ord_param = kDefaultRangeStart;
  auto ret = OriginModelInference(func_graph, param, origin_model, &origin_model_size);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Origin Model Inference failed.";
    return ret;
  }

  float best_compress_ratio = 0;
  bool found_valid_scale = false;
  int steps_per_stage = param->mixedBitWeightQuantParam.max_iterations / (kMaxFineSearchIterations + 1);
  if (steps_per_stage > kMaxStepCoarseSearch) {
    steps_per_stage = kMaxStepCoarseSearch;
  }

  int range_start = kDefaultRangeStart;
  int range_end = kDefaultRangeEnd;
  int step = (range_end - range_start) / steps_per_stage;
  SearchParams search_param = {range_start, range_end, step};

  std::cout << "====== Search for the best scale =======\n";
  ret = WeightQuantModelInference(func_graph, param, origin_model, origin_model_size, &search_param, &ord_param,
                                  &best_compress_ratio, &found_valid_scale);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Weight quant graph inference failed.";
    return ret;
  }
  step = search_param.step;

  if (!found_valid_scale || (ord_param == kDefaultRangeStart && (best_compress_ratio < kCompressToInt8Ratio))) {
    range_start = search_param.range_end;
    range_end = kDefaultExtendFactor * search_param.range_end;
    SearchParams wider_range_param = {range_start, range_end, step};
    std::cout << "=== Couldn't find proper compression, extending the search range ===\n";
    ret = WeightQuantModelInference(func_graph, param, origin_model, origin_model_size, &wider_range_param, &ord_param,
                                    &best_compress_ratio, &found_valid_scale);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Weight quant graph inference failed.";
      return ret;
    }
    if (!found_valid_scale) {
      std::cout << "=== Couldn't find compression that will match similarity constraints. Aborting! ===\n";
      std::cout << "======================= You may try fixed 8bit quantization =======================\n";
      return RET_ERROR;
    }
  }

  if (steps_per_stage > kMaxStepFineSearch) {
    steps_per_stage = kMaxStepFineSearch;
  }
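  // Fine search: repeatedly narrow the range to the two coarse steps just below the best scale
  // found so far, shrinking the step until it reaches kFinestGranularity.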
  for (int search_cnt = 0; search_cnt < kMaxFineSearchIterations; search_cnt++) {
    int prev_prev_val = ord_param - kTwo * step;
    range_end = ord_param;
    range_start = std::max(1, prev_prev_val);
    step = (range_end - range_start) / steps_per_stage;
    if (step < static_cast<int>(sqrt(static_cast<float>(range_end - range_start))) / kTwo) {
      step = static_cast<int>(sqrt(static_cast<float>(range_end - range_start))) / kTwo;
    }
    if (step == 0) {  // guard: range has collapsed, nothing finer to search
      break;
    }
    range_start = ord_param - ((ord_param - range_start) / step) * step;  // align search to meet prev scale
    if ((range_start == search_param.range_start) || (range_start == prev_prev_val) || (range_start == 1)) {
      range_start += step;
    }

    search_param.range_start = range_start;
    search_param.range_end = range_end;
    search_param.step = step;
    std::cout << "=== Fine search " << search_cnt << " in range [" << range_start << "," << range_end << "] ===\n";
    ret = WeightQuantModelInference(func_graph, param, origin_model, origin_model_size, &search_param, &ord_param,
                                    &best_compress_ratio, &found_valid_scale);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Weight quant graph inference failed.";
      return ret;
    }
    if (step <= kFinestGranularity) {
      break;
    }
  }
  std::cout << "best compression is " << best_compress_ratio << " at scale " << ord_param << std::endl;
  *init_scale = 1.0 / ord_param;
  return RET_OK;
}
}  // namespace mindspore::lite::quant