1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_QUANTIZE_UTIL_H_
18 #define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_QUANTIZE_UTIL_H_
19
20 #ifndef _MSC_VER
21 #include <dirent.h>
22 #endif
23 #include <sys/stat.h>
24 #include <memory>
25 #include <string>
26 #include <cmath>
27 #include <set>
28 #include <array>
29 #include <vector>
30 #include <algorithm>
31 #include <limits>
32 #include <utility>
33 #include "ops/mat_mul.h"
34 #include "ops/lstm.h"
35 #include "ops/fusion/full_connection.h"
36 #include "tools/converter/quantizer/quantizer.h"
37 #include "include/errorcode.h"
38 #include "ir/func_graph.h"
39 #include "ir/anf.h"
40 #include "include/model.h"
41 #include "base/base.h"
42 #include "ir/primitive.h"
43 #include "abstract/dshape.h"
44 #include "tools/converter/quantizer/huffman_encode.h"
45 #include "tools/converter/quantizer/bitpacking.h"
46 #include "tools/converter/quantizer/mixed_bit_weight_quantizer.h"
47 #include "src/lite_session.h"
48 #include "tools/converter/graphdef_transform.h"
49 #include "src/common/file_utils.h"
50 #include "src/common/quant_utils.h"
51
52 namespace mindspore::lite::quant {
// Selects how a weight tensor is quantized.
// Kept as a plain (unscoped) enum: the enumerators are used unqualified and the value is streamed
// into log messages elsewhere in this header.
enum WeightQuantType {
  // Fixed bit width, one set of quant params per channel (FixedBitQuantFilter -> DoPerChannelQuant).
  FIXED_BIT_PER_CHANNEL = 0,
  // Fixed bit width, one set of quant params for the whole tensor (FixedBitQuantFilter -> DoPerLayerQuant).
  FIXED_BIT_PER_LAYER = 1,
  // Not accepted by FixedBitQuantFilter; presumably the mode served by MixedBitQuantFilter — confirm.
  MIXED_BIT_PER_LAYER = 2,
};
// Default bit width; used as the default `numBits` of CalQuantizationParams.
constexpr size_t kUint8Quantization = 8;
// NOTE(review): presumably the largest supported weight bit width — usage is not visible in this header.
constexpr size_t kMaxBit = 8;
// NOTE(review): generic 1024 bound; its users are not visible in this header.
constexpr size_t kMaxNum1024 = 1024;
// Base for percentage arithmetic (value / kPercentBase).
constexpr float kPercentBase = 100.0f;
// NOTE(review): name suggests a time base in milliseconds; confirm against callers.
constexpr size_t kMillisecondsBase = 10;
// NOTE(review): presumably the input index of the weight tensor ("Wight" is a typo for "Weight";
// the name is kept because other translation units may reference it).
constexpr size_t kWightIndex = 1;
// NOTE(review): presumably the smallest scale treated as non-zero ("Threashold" typo kept for compatibility).
constexpr double kScaleThreashold = 1e-38;
65
66 struct SessionModel {
67 session::LiteSession *session{nullptr};
68 Model *model{nullptr};
69 };
70
71 /**
72 * 1. when op's weight size > mWeightSize just skip
73 * 2. only do conv/deconv/convdepthwise/deconvdepthwise/mul/matmul/batchmatmul quantization
74 * 3. when conv/deconv/convdepthwise/deconvdepthwise ops' weight channel size > covWeightQuantChannelThreshold just skip
75 * */
76 class QuantStrategy {
77 public:
78 explicit QuantStrategy(size_t weightSize, size_t covWeightQuantChannelThreshold = 16);
79
80 ~QuantStrategy() = default;
81
82 bool CanConvOpQuantized(const CNodePtr &node) const;
83 bool CanMulOpQuantized(const CNodePtr &node) const;
84 static bool CanOpFullQuantized(const AnfNodePtr &node);
85 bool CanTensorQuantized(const AnfNodePtr &inputNode) const;
86
87 size_t m_weight_size_;
88 size_t m_conv_weight_quant_channel_threshold_;
89
90 private:
91 static const std::vector<std::string> conv_types_;
92 static const std::vector<std::string> mul_types_;
93 };
94
// Tuning constants. NOTE(review): their users are outside this header; the meanings below are
// inferred from the names only and should be confirmed against the callers.
constexpr float delta = 0.1f;               // presumably a step / tolerance
constexpr float ratio = 10.0f;              // presumably a scaling ratio
constexpr int percent = 10;                 // presumably a percentage (10%)
constexpr int quant_param_size = 32 * 8;    // 256; presumably the size budget of one quant param
99
100 QuantParamHolderPtr GetCNodeQuantHolder(const PrimitivePtr &primitive);
101
102 STATUS CalQuantizationParams(schema::QuantParamT *quantParam, double mMin, double mMax, bool narrowRange = false,
103 int numBits = kUint8Quantization);
104
105 std::pair<float, float> OutlierMethod(std::vector<float> min_datas, std::vector<float> max_datas);
106
107 std::vector<int8_t> KMeans(float *data, size_t elem_count, size_t k, size_t epochs, schema::QuantParamT *quantParam);
108
109 STATUS UpdateTensorDataAndSize(const tensor::TensorPtr &weight, void *quant_datas, int new_size, TypeId new_data_type);
110
111 int CalChannels(const ShapeVector &dims, int channel_cnt, bool *channel_at_first);
112
113 void CalQuantAssitInfo(const PrimitivePtr &primitive, const ShapeVector &shapes, int index, bool *channel_at_first,
114 int *channel_cnt);
115
116 void CalQuantAssitInfo(const schema::PrimitiveT &primitive, const std::vector<int> &shapes, int index,
117 bool *channel_at_first, int *channel_cnt);
118
119 bool TensorQuantParamsInited(const schema::TensorT &tensor);
120
121 template <typename T>
DoBitPack(const tensor::TensorPtr & weight,const size_t & bit_num,const std::vector<T> & quant_datas)122 STATUS DoBitPack(const tensor::TensorPtr &weight, const size_t &bit_num, const std::vector<T> &quant_datas) {
123 if (bit_num != 8 && bit_num != 16) {
124 std::vector<T> data{};
125 for (size_t i = 0; i < quant_datas.size(); ++i) {
126 data.emplace_back((static_cast<T>(quant_datas[i])));
127 }
128 if (bit_num > 0 && bit_num < 8) {
129 std::vector<uint8_t> pack_data{};
130 BitPack::BitPacking<T, uint8_t>(bit_num, data, &pack_data);
131 auto status =
132 UpdateTensorDataAndSize(weight, pack_data.data(), pack_data.size() * sizeof(uint8_t), kNumberTypeUInt8);
133 if (status != RET_OK) {
134 MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
135 return RET_ERROR;
136 }
137 } else if (bit_num > 8 && bit_num < 16) {
138 std::vector<uint16_t> pack_data{};
139 BitPack::BitPacking<T, uint16_t>(bit_num, data, &pack_data);
140 auto status =
141 UpdateTensorDataAndSize(weight, pack_data.data(), pack_data.size() * sizeof(uint16_t), kNumberTypeUInt16);
142 if (status != RET_OK) {
143 MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
144 return RET_ERROR;
145 }
146 }
147 }
148 return RET_OK;
149 }
150
151 STATUS MixedBitQuantFilter(const tensor::TensorPtr &weight, const PrimitivePtr &primitive, QuantType quant_type,
152 WeightQuantType weight_quant_type, TypeId quant_data_type, double init_scale, int index);
153
154 template <typename T>
155 STATUS FixedBitQuantFilter(const tensor::TensorPtr &weight, const PrimitivePtr &primitive, QuantType quant_type,
156 int quant_max, int quant_min, size_t bit_num, WeightQuantType weight_quant_type,
157 TypeId quant_data_type, int index = 1, bool k_means = false) {
158 MS_ASSERT(weight != nullptr);
159 MS_ASSERT(primitive != nullptr);
160 auto dims = weight->shape();
161 if (weight_quant_type == FIXED_BIT_PER_CHANNEL) {
162 if (dims.size() <= 1) {
163 MS_LOG(WARNING) << "dims is " << dims.size() << " can not per_channel";
164 weight_quant_type = FIXED_BIT_PER_LAYER;
165 }
166 }
167
168 std::vector<schema::QuantParamT> quant_params;
169 size_t elem_count = weight->DataSize();
170 auto *raw_data = static_cast<float *>(weight->data_c());
171 if (raw_data == nullptr) {
172 MS_LOG(ERROR) << "rawDatas is nullptr";
173 return RET_ERROR;
174 }
175
176 std::vector<T> quant_data(elem_count);
177 int ret = RET_OK;
178 if (weight_quant_type == FIXED_BIT_PER_CHANNEL) {
179 bool channel_at_first = true;
180 int channel_cnt = -1;
181 CalQuantAssitInfo(primitive, dims, index, &channel_at_first, &channel_cnt);
182 auto channels = CalChannels(dims, channel_cnt, &channel_at_first);
183 if (channels == 0) {
184 MS_LOG(ERROR) << "channels is zero";
185 return RET_ERROR;
186 }
187 ret = DoPerChannelQuant<T>(static_cast<float *>(weight->data_c()), weight->DataSize(),
188 static_cast<mindspore::schema::QuantType>(quant_type), &quant_params, quant_max,
189 quant_min, bit_num, k_means, &quant_data, channels, channel_at_first);
190 if (ret == RET_CONTINUE) {
191 return ret;
192 } else if (ret != RET_OK) {
193 MS_LOG(ERROR) << "Do per channel quant failed.";
194 return ret;
195 }
196 } else if (weight_quant_type == FIXED_BIT_PER_LAYER) {
197 ret = DoPerLayerQuant<T>(static_cast<float *>(weight->data_c()), weight->DataSize(), &quant_params, quant_max,
198 quant_min, bit_num, k_means, &quant_data);
199 if (ret != RET_OK) {
200 MS_LOG(ERROR) << "Do per layer quant failed.";
201 return ret;
202 }
203 } else {
204 MS_LOG(ERROR) << "Unsupported weight quant type:" << weight_quant_type;
205 }
206 auto status = UpdateTensorDataAndSize(weight, quant_data.data(), quant_data.size() * sizeof(T), quant_data_type);
207 if (status != RET_OK) {
208 MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
209 return RET_ERROR;
210 }
211
212 #ifdef HUFFMAN_ENCODE
213 auto huffman_encode = std::make_unique<lite::HuffmanEncode>();
214 ret = huffman_encode->DoHuffmanEncode(weight, primitive, quant_datas.data(), bit_num);
215 if (ret != RET_OK) {
216 MS_LOG(ERROR) << "Do huffman encode failed.";
217 return ret;
218 }
219 #endif
220
221 if (quant_params.empty()) {
222 MS_LOG(ERROR) << "quant_params empty";
223 return RET_ERROR;
224 }
225 auto quant_param_holder = GetCNodeQuantHolder(primitive);
226 if (quant_type == QuantType_QUANT_ALL) {
227 quant_param_holder->set_input_quant_param(index, quant_params);
228 } else {
229 quant_param_holder->set_input_quant_param(index, quant_params);
230 }
231 return ret;
232 }
233
234 // utils
235
236 std::string NodePrimitiveType(const CNodePtr &cnode);
237
238 SessionModel CreateSessionByFuncGraph(const FuncGraphPtr &func_graph, const converter::Flags &flags, int thread_num);
239
240 FuncGraphPtr CopyFuncGraph(const FuncGraphPtr &);
241
242 void GetLiteParameter(const AnfNodePtr &node, ParameterPtr *param_node, tensor::TensorPtr *tensor_info);
243
244 bool CheckNodeInSet(const CNodePtr &cnode, const std::set<PrimitivePtr> &support_primitive_types);
245 } // namespace mindspore::lite::quant
246 #endif // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_QUANTIZE_UTIL_H_
247