/**
 * Copyright 2020-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_WEIGHT_DECODER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_WEIGHT_DECODER_H_

#include <map>
#include <utility>
#include <vector>
#include <queue>
#include <limits>
#include <string>
#include <sstream>  // std::istringstream used by CompareVersion
#include <cstdlib>  // std::strtol
#include <cstring>  // memcpy
#include <cmath>
#include "nnacl/matmul_parameter.h"
#include "nnacl/gather_parameter.h"
#include "src/executor/kernel_exec.h"
#include "src/common/utils.h"
#include "src/tensor.h"
#include "src/litert/lite_model.h"

// Number of quantization parameters when a tensor is quantized per-tensor (per-layer).
static constexpr int kPerTensor = 1;
// Bit-width constants for packed/quantized weight data.
static constexpr int kBitNumMix = 0;
static constexpr int kBitNum1 = 1;
static constexpr int kBitNum8 = 8;
static constexpr int kBitNum16 = 16;
static constexpr int kBitNum32 = 32;
// Upper bound for a plausible variance-correction factor in per-channel dequantization.
static constexpr int kMaxVarCorr = 10;
// Decimal base used when parsing version strings.
static constexpr int kNumberBase = 10;

namespace mindspore::lite {

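// WeightDecoder restores compressed model weights: it decompresses indexed, sparse,
// Huffman-coded, and bit-packed tensor data, and dequantizes per-tensor or per-channel
// quantized weights back to the requested destination type.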
class MS_API WeightDecoder {
 public:
  static int DequantNode(const OpParameter *op_parameter, const std::vector<Tensor *> &in_tensors,
                         TypeId dst_data_type, const std::string &model_version, bool float_mode);
  static int DecompressTensor(const SchemaTensorWrapper &src_tensor, lite::Tensor *dst_tensor);

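  // Compares two dotted version strings segment by segment (e.g. "1.6.0" vs "1.8.0").
  // Returns 1 if version1 > version2, -1 if version1 < version2, and 0 if they are equal;
  // missing segments are treated as 0.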
  static int CompareVersion(const std::string &version1, const std::string &version2) {
    std::istringstream iss1(version1);
    std::istringstream iss2(version2);
    std::string string1;
    std::string string2;
    while (!iss1.eof() || !iss2.eof()) {
      getline(iss1, string1, '.');
      getline(iss2, string2, '.');
      int64_t integer1 = std::strtol(string1.c_str(), nullptr, kNumberBase);
      int64_t integer2 = std::strtol(string2.c_str(), nullptr, kNumberBase);
      if (integer1 > integer2) return 1;
      if (integer1 < integer2) return -1;
      string1 = string2 = "0";
    }
    return 0;
  }

  static int GetMatMulPreferredDim(const OpParameter *op_parameter, int input_index, const std::vector<int> &dims);

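  // Returns the tensor dimension along which per-channel quantization parameters are laid out.
  // The choice depends on the exporting MindSpore Lite version and the operator type
  // (MatMulFusion, Conv2dTransposeFusion, Gather); older models fall back to a channel-first heuristic.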
  template <typename T>
  static int GetPreferredDim(const std::vector<T *> &in_tensors, const OpParameter *op_parameter, int index,
                             const std::vector<int> &dims, const std::string &model_version) {
#ifndef WEIGHT_DECODE_CLIP
    const int first_version_offset = 15;
    if (model_version.empty() || model_version.substr(0, first_version_offset) != "MindSpore Lite " ||
        CompareVersion(model_version.substr(first_version_offset, model_version.size()), "1.6.0") == -1) {
      return IsChannelFirst(index, op_parameter) ? 0 : 1;
    }
    if (op_parameter->type_ == schema::PrimitiveType_MatMulFusion) {
      return GetMatMulPreferredDim(op_parameter, index, dims);
    } else if (op_parameter->type_ == schema::PrimitiveType_Conv2dTransposeFusion) {
      if (model_version.empty() ||
          CompareVersion(model_version.substr(first_version_offset, model_version.size()), "1.8.0") == -1) {
        return 0;
      }
      return GetDeConvPreferredDim(op_parameter, dims);
    } else if (op_parameter->type_ == schema::PrimitiveType_Gather) {
      return GetGatherPreferredDim(op_parameter, in_tensors);
    }
    // The first index.
    return 0;
#else
    MS_LOG(ERROR) << "Do not support preferred dim.";
    return RET_NOT_SUPPORT;
#endif
  }

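  // Dequantizes the weight data of input_tensor (stored as type ST) into a newly allocated
  // buffer of type DT and returns it; the caller takes ownership and must free it.
  // Dispatches to per-channel or per-layer dequantization based on the number of quant params.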
  template <typename ST, typename DT = float>
  static DT *DequantData(const lite::Tensor *input_tensor, int preferred_dim) {
#ifndef WEIGHT_DECODE_CLIP
    const auto *quant_datas = static_cast<const ST *>(input_tensor->data());
    if (quant_datas == nullptr) {
      MS_LOG(ERROR) << "Get quant tensor failed.";
      return nullptr;
    }
    auto quant_param = input_tensor->quant_params();
    if (quant_param.size() != kPerTensor) {
      return DequantPerChannelData<ST, DT>(input_tensor, quant_datas, preferred_dim);
    } else {
      return DequantPerLayerData<ST, DT>(input_tensor, quant_datas);
    }
#else
    MS_LOG(ERROR) << "Do not support dequant data.";
    return nullptr;
#endif
  }

#ifndef WEIGHT_DECODE_CLIP

 private:
  static int DequantTensor(Tensor *tensor, int preferred_dim, TypeId dst_data_type = kNumberTypeFloat32);

  static int UnPackToInt(const SchemaTensorWrapper &src_tensor, lite::Tensor *dst_tensor);

  static int DecodeHuffmanCode(const SchemaTensorWrapper &src_tensor, lite::Tensor *dst_tensor);

  static int UnPack(const SchemaTensorWrapper &src_tensor, lite::Tensor *dst_tensor);

  static STATUS SparseDecompress(const SchemaTensorWrapper &src_tensor, Tensor *dst_tensor);

  static std::vector<bool> StringToBitVector(const std::string &str);

  static STATUS IndexingDecompress(const SchemaTensorWrapper &src_tensor, Tensor *dst_tensor);

  static bool IsChannelFirst(int index, const OpParameter *op_parameter);

  // A * stride_a + bucket_index * stride_b + C
  static int GetDataIndex(const std::vector<int> &dims, int preferred_dim, int bucket_index, int bucket_in_index);

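  // Dequantizes a per-tensor (per-layer) quantized tensor: applies a single scale/zero-point pair
  // to every element, or looks values up in the KMeans cluster table when quant_clusters is present.
  // Returns a malloc'ed buffer of DT owned by the caller, or nullptr on failure.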
  template <typename ST, typename DT = float>
  static DT *DequantPerLayerData(const lite::Tensor *input_tensor, const ST *quant_datas) {
    auto quant_param = input_tensor->quant_params();
    auto input_tensor_element_num = input_tensor->ElementsNum();
    MS_CHECK_GT(input_tensor_element_num, 0, nullptr);
    DT *dequant_datas = static_cast<DT *>(malloc(input_tensor_element_num * sizeof(DT)));
    if (dequant_datas == nullptr) {
      MS_LOG(ERROR) << "Malloc failed.";
      return nullptr;
    }
    auto quant_clusters = input_tensor->quant_clusters();
    auto param = quant_param.front();
    auto scale = param.scale;
    auto zero_point = param.zeroPoint;
    for (int64_t j = 0; j < input_tensor_element_num; j++) {
      if (!quant_clusters.empty()) {
        int index = quant_datas[j];
        if (index > INT8_MAX || index < INT8_MIN) {
          MS_LOG(ERROR) << "KMeans quant index is out of int8 range.";
          free(dequant_datas);
          return nullptr;
        }
        if (abs(index - INT8_MIN) >= static_cast<int>(param.clusters.size())) {
          MS_LOG(ERROR) << "index exceeds the bounds of param.clusters";
          free(dequant_datas);
          return nullptr;
        }
        dequant_datas[j] = static_cast<DT>(param.clusters[index - INT8_MIN]);
      } else {
#ifdef ENABLE_ARM32
        volatile float dequant_data = (quant_datas[j] - zero_point) * scale;
        dequant_datas[j] = static_cast<DT>(dequant_data);
#else
        dequant_datas[j] = static_cast<DT>((quant_datas[j] - zero_point) * scale);
#endif
      }
    }
    return dequant_datas;
  }

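  // Dequantizes a per-channel quantized tensor: each channel along preferred_dim has its own
  // scale, zero point, and optional variance/mean correction. Returns a malloc'ed buffer of DT
  // owned by the caller, or nullptr on failure.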
  template <typename ST, typename DT = float>
  static DT *DequantPerChannelData(const lite::Tensor *input_tensor, const ST *quant_datas, int preferred_dim) {
    auto quant_param = input_tensor->quant_params();
    auto input_tensor_element_num = input_tensor->ElementsNum();
    MS_CHECK_GT(input_tensor_element_num, 0, nullptr);
    DT *dequant_datas = static_cast<DT *>(malloc(input_tensor_element_num * sizeof(DT)));
    if (dequant_datas == nullptr) {
      MS_LOG(ERROR) << "Malloc failed.";
      return nullptr;
    }
    auto shapes = input_tensor->shape();
    auto channels = quant_param.size();
    MS_CHECK_LT(static_cast<size_t>(preferred_dim), shapes.size(), nullptr);
    if (channels != static_cast<size_t>(shapes.at(preferred_dim))) {
      MS_LOG(ERROR) << input_tensor->tensor_name() << " shapes at preferred_dim " << preferred_dim << " is "
                    << shapes.at(preferred_dim) << " != channels " << channels;
      free(dequant_datas);
      return nullptr;
    }
    MS_CHECK_GT(channels, 0, nullptr);
    size_t per_channel_size = input_tensor_element_num / channels;
    for (size_t i = 0; i < channels; i++) {
      auto param = quant_param.at(i);
      auto scale = param.scale;
      auto zero_point = param.zeroPoint;
      auto var_corr = param.var_corr;
      auto mean_corr = param.mean_corr;
      if (var_corr < 0 || var_corr > kMaxVarCorr) {
        MS_LOG(WARNING) << "unexpected var_corr: " << var_corr;
        var_corr = 1;
      }
      for (size_t j = 0; j < per_channel_size; j++) {
        auto index = GetDataIndex(shapes, preferred_dim, i, j);
#ifdef ENABLE_ARM32
        volatile float dequant_data = (quant_datas[index] - zero_point) * scale * var_corr + mean_corr;
        dequant_datas[index] = static_cast<DT>(dequant_data);
#else
        dequant_datas[index] = static_cast<DT>((quant_datas[index] - zero_point) * scale * var_corr + mean_corr);
#endif
      }
    }
    return dequant_datas;
  }

  static int GetDeConvPreferredDim(const OpParameter *op_parameter, const std::vector<int> &dims);

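  // Returns the quantization axis for a Gather op: the value of the constant axis tensor
  // (the third input) when it is available, otherwise the axis_ field of GatherParameter.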
  template <typename T>
  static int GetGatherPreferredDim(const OpParameter *op_parameter, const std::vector<T *> &in_tensors) {
    MS_ASSERT(op_parameter != nullptr);
    const int axis_index = 2;
    const int axis_tensor_size = 3;
    if (in_tensors.size() == axis_tensor_size && in_tensors.at(axis_index)->IsConst()) {
      if (in_tensors.at(axis_index)->data_type() == kNumberTypeInt32) {
        return static_cast<int *>(in_tensors.at(axis_index)->data())[0];
      } else if (in_tensors.at(axis_index)->data_type() == kNumberTypeInt64) {
        return static_cast<int>(static_cast<int64_t *>(in_tensors.at(axis_index)->data())[0]);
      }
    }
    const auto *param = reinterpret_cast<const GatherParameter *>(op_parameter);
    return param->axis_;
  }

  static int DequantWeight(lite::Tensor *input_tensor, int preferred_dim, TypeId dst_data_type = kNumberTypeFloat32);

  static int DecodeKMeansWeight(lite::Tensor *tensor, TypeId dst_data_type);

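  // Expands KMeans-compressed weights: each stored int8 value is an offset from INT8_MIN into the
  // tensor's quant_clusters table. Writes a malloc'ed buffer of T to *dequant_data (caller frees it).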
  template <typename T>
  static int DecodeKMeansData(lite::Tensor *tensor, T **dequant_data) {
    CHECK_NULL_RETURN(dequant_data);
    *dequant_data = static_cast<T *>(malloc(tensor->ElementsNum() * sizeof(T)));
    CHECK_NULL_RETURN(*dequant_data);
    for (int64_t i = 0; i < tensor->ElementsNum(); i++) {
      auto index = static_cast<int8_t *>(tensor->data())[i] - INT8_MIN;
      (*dequant_data)[i] = static_cast<T>(tensor->quant_clusters().at(index));
    }
    return RET_OK;
  }

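  // Appends the bits of packed_data to unpack_bit_data, then emits every complete origin_bit-wide
  // group as a signed value of type T1 into unpack_int, offset by -2^(origin_bit - 1). When is_last
  // is set, any remaining bits are flushed as one final value. *count tracks the number of values
  // written and is capped at limit_size.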
  template <typename T1, typename T2>
  static void UnPackData(int origin_bit, const T2 &packed_data, std::queue<bool> *unpack_bit_data, void *unpack_int,
                         size_t *count, size_t limit_size, bool is_last) {
    T2 uint_result = 0;
    T1 result;
    UnPackFromUintToOrigin<T2>(packed_data, unpack_bit_data);
    const int base = 2;
    while (static_cast<int>(unpack_bit_data->size()) >= origin_bit) {
      for (int k = 0; k < origin_bit; k++) {
        bool bit_tmp = unpack_bit_data->front();
        uint_result = (static_cast<size_t>(bit_tmp) << static_cast<unsigned int>(k)) + uint_result;
        unpack_bit_data->pop();
      }
      result = static_cast<T1>(uint_result - static_cast<T2>(pow(base, origin_bit - 1)));
      if (*count >= limit_size) {
        return;
      }
      (static_cast<T1 *>(unpack_int))[*count] = result;
      uint_result = 0;
      (*count)++;
    }
    size_t remainder = unpack_bit_data->size();
    if (is_last && remainder > 0) {
      for (size_t i = 0; i < remainder; i++) {
        bool bit = unpack_bit_data->front();
        uint_result = (static_cast<unsigned int>(bit) << i) + uint_result;
        unpack_bit_data->pop();
      }
      result = static_cast<T1>(uint_result - static_cast<T2>(pow(base, origin_bit - 1)));
      if (*count >= limit_size) {
        return;
      }
      (static_cast<T1 *>(unpack_int))[*count] = result;
    }
  }

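  // Unpacks an entire bit-packed weight blob: iterates over the packed storage units (of type T2)
  // in src_tensor and uses UnPackData to reconstruct at most unpack_int_up_limit_size values of
  // origin_bit bits each into unpack_int_data.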
  template <typename T1, typename T2>
  static int UnPackUtil(const SchemaTensorWrapper &src_tensor, const size_t &unpack_int_up_limit_size, int origin_bit,
                        void *unpack_int_data) {
    MS_ASSERT(src_tensor.handler() != nullptr);
    MS_ASSERT(src_tensor.data() != nullptr);
    if (src_tensor.data() == nullptr) {
      MS_LOG(ERROR) << "tensor data is null";
      return RET_NULL_PTR;
    }
    auto weight_data = src_tensor.data();
    size_t pack_size =
      src_tensor.handler()->dataType() == kNumberTypeInt8 ? src_tensor.length() : src_tensor.length() / 2;
    std::queue<bool> unpack_bit_data;
    size_t count = 0;
    for (size_t i = 0; i < pack_size; ++i) {
      T2 pack_data = (static_cast<const T2 *>(static_cast<const void *>(weight_data)))[i];
      bool is_last = i == pack_size - 1;
      if (count >= unpack_int_up_limit_size) {
        MS_LOG(ERROR) << "count exceeds unpack_int_up_limit_size, which is " << unpack_int_up_limit_size;
        return RET_ERROR;
      }
      UnPackData<T1, T2>(origin_bit, pack_data, &unpack_bit_data, unpack_int_data, &count, unpack_int_up_limit_size,
                         is_last);
    }
    return RET_OK;
  }

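  // Pushes the bits of packed_data onto unpack_bit_data, least-significant bit first.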
  template <typename T2>
  static void UnPackFromUintToOrigin(const T2 &packed_data, std::queue<bool> *unpack_bit_data) {
    auto n = packed_data;
    size_t bit_count = 0;
    while (bit_count < sizeof(T2) * kBitNum8) {
      bool a = n % 2;
      n = n >> 1;
      bit_count++;
      unpack_bit_data->push(a);
    }
  }

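  // Reverses indexing compression: maps each entry of `indices` back to its value in
  // `unique_values`, range-checks it against type T, and copies the reconstructed data into
  // dst_data, which must be exactly dst_data_size bytes.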
  template <typename T>
  static STATUS UnIndexTensorData(const std::vector<int> &unique_values, const std::vector<size_t> &indices,
                                  void *dst_data, size_t dst_data_size) {
    std::vector<T> un_indexed_data;
    for (auto index : indices) {
      if (index >= unique_values.size()) {
        MS_LOG(ERROR) << "index: " << index << " size: " << unique_values.size();
        return RET_ERROR;
      }
      if (unique_values[index] > std::numeric_limits<T>::max() ||
          unique_values[index] < std::numeric_limits<T>::min()) {
        MS_LOG(ERROR) << "data: " << unique_values[index] << " max: " << std::numeric_limits<T>::max()
                      << " min: " << std::numeric_limits<T>::min();
        return RET_ERROR;
      }
      un_indexed_data.push_back(static_cast<T>(unique_values[index]));
    }
    if (un_indexed_data.size() * sizeof(T) != dst_data_size) {
      MS_LOG(ERROR) << "un-indexed data size: " << un_indexed_data.size() * sizeof(T)
                    << " expected by tensor: " << dst_data_size;
      return RET_ERROR;
    }
    memcpy(dst_data, un_indexed_data.data(), un_indexed_data.size() * sizeof(T));

    return RET_OK;
  }

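  // Reverses sparsity compression: for each stored non-zero value, first fills the preceding run
  // of coors[i] elements with the zero point of the corresponding channel, then appends the value
  // itself; the tail is padded with zero points before copying the result into dst_data.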
  template <typename T>
  static STATUS UnSparseTensorData(const std::vector<int> &unique_values, const std::vector<size_t> &indices,
                                   const std::vector<size_t> &coors,
                                   const flatbuffers::Vector<flatbuffers::Offset<schema::QuantParam>> *quant_params,
                                   size_t elem_cnt, size_t coor_best_bit, void *dst_data, size_t dst_data_size) {
    std::vector<T> un_sparsed_data;
    size_t data_index = 0;
    auto nz_cnt = indices.size();
    MS_ASSERT(nz_cnt == coors.size());
    auto channel_cnt = quant_params->size();
    MS_CHECK_GT(channel_cnt, 0, RET_ERROR);
    auto elem_perchannel = elem_cnt / channel_cnt;
    MS_CHECK_GT(elem_perchannel, 0, RET_ERROR);
    for (size_t i = 0; i < nz_cnt; i++) {
      auto index = indices[i];
      if (index >= unique_values.size()) {
        MS_LOG(ERROR) << "index: " << index << " size: " << unique_values.size();
        return RET_ERROR;
      }
      auto nz = unique_values[index];
      if (nz > std::numeric_limits<T>::max() || nz < std::numeric_limits<T>::min()) {
        MS_LOG(ERROR) << "data: " << nz << " max: " << std::numeric_limits<T>::max()
                      << " min: " << std::numeric_limits<T>::min();
        return RET_ERROR;
      }
      auto coor = coors[i];
      for (size_t j = 0; j < coor; j++) {
        auto cur_channel = data_index / elem_perchannel;
        auto zp = quant_params->Get(cur_channel)->zeroPoint();
        un_sparsed_data.push_back(static_cast<T>(zp));
        data_index++;
      }
      un_sparsed_data.push_back(static_cast<T>(unique_values[index]));
      data_index++;
    }
    if (un_sparsed_data.size() * sizeof(T) > dst_data_size) {
      MS_LOG(ERROR) << "un-sparsed data size: " << un_sparsed_data.size() * sizeof(T)
                    << " tensor size: " << dst_data_size;
      return RET_ERROR;
    } else if (un_sparsed_data.size() * sizeof(T) < dst_data_size &&
               (un_sparsed_data.size() + (1 << coor_best_bit) - 1) * sizeof(T) < dst_data_size) {
      MS_LOG(ERROR) << "un-sparsed data size: " << un_sparsed_data.size() * sizeof(T)
                    << " tensor size: " << dst_data_size << " coor_best_bit: " << coor_best_bit;
      return RET_ERROR;
    }

    for (; data_index < dst_data_size / sizeof(T); data_index++) {
      auto cur_channel = data_index / elem_perchannel;
      auto zp = quant_params->Get(cur_channel)->zeroPoint();
      un_sparsed_data.push_back(static_cast<T>(zp));
    }

    memcpy(dst_data, un_sparsed_data.data(), un_sparsed_data.size() * sizeof(T));

    return RET_OK;
  }
#endif
};
}  // namespace mindspore::lite
#endif  // MINDSPORE_LITE_SRC_RUNTIME_WEIGHT_DECODER_H_