/**
 * Copyright 2020-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_SRC_RUNTIME_WEIGHT_DECODER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_WEIGHT_DECODER_H_

#include <map>
#include <utility>
#include <vector>
#include <queue>
#include <limits>
#include <string>
#include <sstream>
#include <cmath>
#include "nnacl/matmul_parameter.h"
#include "nnacl/gather_parameter.h"
#include "src/executor/kernel_exec.h"
#include "src/common/utils.h"
#include "src/tensor.h"
#include "src/litert/lite_model.h"

static constexpr int kPerTensor = 1;
static constexpr int kBitNumMix = 0;
static constexpr int kBitNum1 = 1;
static constexpr int kBitNum8 = 8;
static constexpr int kBitNum16 = 16;
static constexpr int kBitNum32 = 32;
static constexpr int kMaxVarCorr = 10;
static constexpr int kNumberBase = 10;

namespace mindspore::lite {

class MS_API WeightDecoder {
 public:
  static int DequantNode(const OpParameter *op_parameter, const std::vector<Tensor *> &in_tensors,
                         TypeId dst_data_type, const std::string &model_version, bool float_mode);

  static int DecompressTensor(const SchemaTensorWrapper &src_tensor, lite::Tensor *dst_tensor);

  static int CompareVersion(const std::string &version1, const std::string &version2) {
    std::istringstream iss1(version1);
    std::istringstream iss2(version2);
    std::string string1;
    std::string string2;
    while (!iss1.eof() || !iss2.eof()) {
      getline(iss1, string1, '.');
      getline(iss2, string2, '.');
      int64_t integer1 = std::strtol(string1.c_str(), nullptr, kNumberBase);
      int64_t integer2 = std::strtol(string2.c_str(), nullptr, kNumberBase);
      if (integer1 > integer2) return 1;
      if (integer1 < integer2) return -1;
      string1 = string2 = "0";
    }
    return 0;
  }
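
  // Note on CompareVersion above (informational, derived from the implementation): versions are
  // compared field by field as dot-separated integers, and missing trailing fields compare as 0,
  // e.g. CompareVersion("1.8.0", "1.6.0") == 1 and CompareVersion("1.6", "1.6.0") == 0.
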
  static int GetMatMulPreferredDim(const OpParameter *op_parameter, int input_index, const std::vector<int> &dims);

  template <typename T>
  static int GetPreferredDim(const std::vector<T *> &in_tensors, const OpParameter *op_parameter, int index,
                             const std::vector<int> &dims, const std::string &model_version) {
#ifndef WEIGHT_DECODE_CLIP
    const int first_version_offset = 15;
    if (model_version.empty() || model_version.substr(0, first_version_offset) != "MindSpore Lite " ||
        CompareVersion(model_version.substr(first_version_offset, model_version.size()), "1.6.0") == -1) {
      return IsChannelFirst(index, op_parameter) ? 0 : 1;
    }
    if (op_parameter->type_ == schema::PrimitiveType_MatMulFusion) {
      return GetMatMulPreferredDim(op_parameter, index, dims);
    } else if (op_parameter->type_ == schema::PrimitiveType_Conv2dTransposeFusion) {
      if (model_version.empty() ||
          CompareVersion(model_version.substr(first_version_offset, model_version.size()), "1.8.0") == -1) {
        return 0;
      }
      return GetDeConvPreferredDim(op_parameter, dims);
    } else if (op_parameter->type_ == schema::PrimitiveType_Gather) {
      return GetGatherPreferredDim(op_parameter, in_tensors);
    }
    // The first index.
    return 0;
#else
    MS_LOG(ERROR) << "Do not support preferred dim.";
    return RET_NOT_SUPPORT;
#endif
  }

  template <typename ST, typename DT = float>
  static DT *DequantData(const lite::Tensor *input_tensor, int preferred_dim) {
#ifndef WEIGHT_DECODE_CLIP
    const auto *quant_datas = static_cast<const ST *>(input_tensor->data());
    if (quant_datas == nullptr) {
      MS_LOG(ERROR) << "Get quant tensor failed.";
      return nullptr;
    }
    auto quant_param = input_tensor->quant_params();
    if (quant_param.size() != kPerTensor) {
      return DequantPerChannelData<ST, DT>(input_tensor, quant_datas, preferred_dim);
    } else {
      return DequantPerLayerData<ST, DT>(input_tensor, quant_datas);
    }
#else
    MS_LOG(ERROR) << "Do not support dequant data.";
    return nullptr;
#endif
  }

#ifndef WEIGHT_DECODE_CLIP

 private:
  static int DequantTensor(Tensor *tensor, int preferred_dim, TypeId dst_data_type = kNumberTypeFloat32);

  static int UnPackToInt(const SchemaTensorWrapper &src_tensor, lite::Tensor *dst_tensor);

  static int DecodeHuffmanCode(const SchemaTensorWrapper &src_tensor, lite::Tensor *dst_tensor);

  static int UnPack(const SchemaTensorWrapper &src_tensor, lite::Tensor *dst_tensor);

  static STATUS SparseDecompress(const SchemaTensorWrapper &src_tensor, Tensor *dst_tensor);

  static std::vector<bool> StringToBitVector(const std::string &str);

  static STATUS IndexingDecompress(const SchemaTensorWrapper &src_tensor, Tensor *dst_tensor);

  static bool IsChannelFirst(int index, const OpParameter *op_parameter);

  // A * stride_a + bucket_index * stride_b + C
  static int GetDataIndex(const std::vector<int> &dims, int preferred_dim, int bucket_index, int bucket_in_index);
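
  // Note on the dequantization helpers below (informational summary of the code, not an external
  // spec): per-layer data uses the affine mapping real = (quantized - zeroPoint) * scale, while
  // per-channel data also applies the stored correction real = (quantized - zeroPoint) * scale *
  // var_corr + mean_corr. For example, with scale = 0.5 and zeroPoint = -2, a quantized value of 6
  // dequantizes to (6 - (-2)) * 0.5 = 4.0.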
  template <typename ST, typename DT = float>
  static DT *DequantPerLayerData(const lite::Tensor *input_tensor, const ST *quant_datas) {
    auto quant_param = input_tensor->quant_params();
    auto input_tensor_element_num = input_tensor->ElementsNum();
    MS_CHECK_GT(input_tensor_element_num, 0, nullptr);
    DT *dequant_datas = static_cast<DT *>(malloc(input_tensor_element_num * sizeof(DT)));
    if (dequant_datas == nullptr) {
      MS_LOG(ERROR) << "Malloc failed.";
      return nullptr;
    }
    auto quant_clusters = input_tensor->quant_clusters();
    auto param = quant_param.front();
    auto scale = param.scale;
    auto zero_point = param.zeroPoint;
    for (int64_t j = 0; j < input_tensor_element_num; j++) {
      if (!quant_clusters.empty()) {
        int8_t index = quant_datas[j];
        if (index > INT8_MAX || index < INT8_MIN) {
          MS_LOG(ERROR) << "KMeans param quant is error.";
          free(dequant_datas);
          return nullptr;
        }
        if (abs(index - INT8_MIN) >= static_cast<int>(param.clusters.size())) {
          MS_LOG(ERROR) << "index exceed the boundary of param.clusters";
          free(dequant_datas);
          return nullptr;
        }
        dequant_datas[j] = static_cast<DT>(param.clusters[index - INT8_MIN]);
      } else {
#ifdef ENABLE_ARM32
        volatile float dequant_data = (quant_datas[j] - zero_point) * scale;
        dequant_datas[j] = static_cast<DT>(dequant_data);
#else
        dequant_datas[j] = static_cast<DT>((quant_datas[j] - zero_point) * scale);
#endif
      }
    }
    return dequant_datas;
  }

  template <typename ST, typename DT = float>
  static DT *DequantPerChannelData(const lite::Tensor *input_tensor, const ST *quant_datas, int preferred_dim) {
    auto quant_param = input_tensor->quant_params();
    auto input_tensor_element_num = input_tensor->ElementsNum();
    MS_CHECK_GT(input_tensor_element_num, 0, nullptr);
    DT *dequant_datas = static_cast<DT *>(malloc(input_tensor_element_num * sizeof(DT)));
    if (dequant_datas == nullptr) {
      MS_LOG(ERROR) << "Malloc failed.";
      return nullptr;
    }
    auto shapes = input_tensor->shape();
    auto channels = quant_param.size();
    MS_CHECK_LT(static_cast<size_t>(preferred_dim), shapes.size(), nullptr);
    if (channels != static_cast<size_t>(shapes.at(preferred_dim))) {
      MS_LOG(ERROR) << input_tensor->tensor_name() << " shapes at preferred_dim " << preferred_dim << " is "
                    << shapes.at(preferred_dim) << " != channels " << channels;
      free(dequant_datas);
      return nullptr;
    }
    MS_CHECK_GT(channels, 0, nullptr);
    size_t per_channel_size = input_tensor_element_num / channels;
    for (size_t i = 0; i < channels; i++) {
      auto param = quant_param.at(i);
      auto scale = param.scale;
      auto zero_point = param.zeroPoint;
      auto var_corr = param.var_corr;
      auto mean_corr = param.mean_corr;
      if (var_corr < 0 || var_corr > kMaxVarCorr) {
        MS_LOG(WARNING) << "unexpected var_corr: " << var_corr;
        var_corr = 1;
      }
      for (size_t j = 0; j < per_channel_size; j++) {
        auto index = GetDataIndex(shapes, preferred_dim, i, j);
#ifdef ENABLE_ARM32
        volatile float dequant_data = (quant_datas[index] - zero_point) * scale * var_corr + mean_corr;
        dequant_datas[index] = static_cast<DT>(dequant_data);
#else
        dequant_datas[index] = static_cast<DT>((quant_datas[index] - zero_point) * scale * var_corr + mean_corr);
#endif
      }
    }
    return dequant_datas;
  }

  static int GetDeConvPreferredDim(const OpParameter *op_parameter, const std::vector<int> &dims);

  template <typename T>
  static int GetGatherPreferredDim(const OpParameter *op_parameter, const std::vector<T *> &in_tensors) {
    MS_ASSERT(op_parameter != nullptr);
    const int axis_index = 2;
    const int axis_tensor_size = 3;
    if (in_tensors.size() == axis_tensor_size && in_tensors.at(axis_index)->IsConst()) {
      if (in_tensors.at(axis_index)->data_type() == kNumberTypeInt32) {
        return static_cast<int *>(in_tensors.at(axis_index)->data())[0];
      } else if (in_tensors.at(axis_index)->data_type() == kNumberTypeInt64) {
        return static_cast<int64_t *>(in_tensors.at(axis_index)->data())[0];
      }
    }
    const auto *param = reinterpret_cast<const GatherParameter *>(op_parameter);
    return param->axis_;
  }

  static int DequantWeight(lite::Tensor *input_tensor, int preferred_dim, TypeId dst_data_type = kNumberTypeFloat32);
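
  // Note on the KMeans (clustered-weight) path below (informational summary of DecodeKMeansData):
  // clustered weights store one int8 index per element; index - INT8_MIN (i.e. index + 128)
  // selects a centroid from the tensor's quant_clusters(), which becomes the decoded value.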
  static int DecodeKMeansWeight(lite::Tensor *tensor, TypeId dst_data_type);

  template <typename T>
  static int DecodeKMeansData(lite::Tensor *tensor, T **dequant_data) {
    CHECK_NULL_RETURN(dequant_data);
    *dequant_data = static_cast<T *>(malloc(tensor->ElementsNum() * sizeof(T)));
    CHECK_NULL_RETURN(*dequant_data);
    for (int64_t i = 0; i < tensor->ElementsNum(); i++) {
      auto index = static_cast<int8_t *>(tensor->data())[i] - INT8_MIN;
      (*dequant_data)[i] = static_cast<T>(tensor->quant_clusters().at(index));
    }
    return RET_OK;
  }
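
  // Note on the bit-unpacking helpers below (UnPackData / UnPackUtil / UnPackFromUintToOrigin),
  // summarizing the code: each packed unit is expanded LSB-first into a bit queue, regrouped into
  // origin_bit-wide unsigned values, and re-centered by subtracting 2^(origin_bit - 1) to recover
  // the signed value. For example, with origin_bit = 4 the unsigned group 0b0011 (3) decodes to
  // 3 - 8 = -5.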
  template <typename T1, typename T2>
  static void UnPackData(int origin_bit, const T2 &packed_data, std::queue<bool> *unpack_bit_data, void *unpack_int,
                         size_t *count, size_t limit_size, bool is_last) {
    T2 uint_result = 0;
    T1 result;
    UnPackFromUintToOrigin<T2>(packed_data, unpack_bit_data);
    const int base = 2;
    while (static_cast<int>(unpack_bit_data->size()) >= origin_bit) {
      for (int k = 0; k < origin_bit; k++) {
        bool bit_tmp = unpack_bit_data->front();
        uint_result = (static_cast<size_t>(bit_tmp) << static_cast<unsigned int>(k)) + uint_result;
        unpack_bit_data->pop();
      }
      result = static_cast<T1>(uint_result - static_cast<T2>(pow(base, origin_bit - 1)));
      if (*count >= limit_size) {
        return;
      }
      (static_cast<T1 *>(unpack_int))[*count] = result;
      uint_result = 0;
      (*count)++;
    }
    size_t remainder = unpack_bit_data->size();
    if (is_last && remainder > 0) {
      for (size_t i = 0; i < remainder; i++) {
        bool bit = unpack_bit_data->front();
        uint_result = (static_cast<unsigned int>(bit) << i) + uint_result;
        unpack_bit_data->pop();
      }
      result = static_cast<T1>(uint_result - static_cast<T2>(pow(base, origin_bit - 1)));
      if (*count >= limit_size) {
        return;
      }
      (static_cast<T1 *>(unpack_int))[*count] = result;
    }
  }

  template <typename T1, typename T2>
  static int UnPackUtil(const SchemaTensorWrapper &src_tensor, const size_t &unpack_int_up_limit_size, int origin_bit,
                        void *unpack_int_data) {
    MS_ASSERT(src_tensor.handler() != nullptr);
    MS_ASSERT(src_tensor.data() != nullptr);
    if (src_tensor.data() == nullptr) {
      MS_LOG(ERROR) << "tensor data is null";
      return RET_NULL_PTR;
    }
    auto weight_data = src_tensor.data();
    size_t pack_size =
      src_tensor.handler()->dataType() == kNumberTypeInt8 ? src_tensor.length() : src_tensor.length() / 2;
    std::queue<bool> unpack_bit_data;
    size_t count = 0;
    for (size_t i = 0; i < pack_size; ++i) {
      T2 pack_data = (static_cast<const T2 *>(static_cast<const void *>(weight_data)))[i];
      bool is_last = i == pack_size - 1;
      if (count >= unpack_int_up_limit_size) {
        MS_LOG(ERROR) << "extend unpack_int_up_limit_size, which is " << unpack_int_up_limit_size;
        return RET_ERROR;
      }
      UnPackData<T1, T2>(origin_bit, pack_data, &unpack_bit_data, unpack_int_data, &count, unpack_int_up_limit_size,
                         is_last);
    }
    return RET_OK;
  }

  template <typename T2>
  static void UnPackFromUintToOrigin(const T2 &packed_data, std::queue<bool> *unpack_bit_data) {
    auto n = packed_data;
    size_t bit_count = 0;
    while (bit_count < sizeof(T2) * kBitNum8) {
      bool a = n % 2;
      n = n >> 1;
      bit_count++;
      unpack_bit_data->push(a);
    }
  }

  template <typename T>
  static STATUS UnIndexTensorData(const std::vector<int> &unique_values, const std::vector<size_t> &indices,
                                  void *dst_data, size_t dst_data_size) {
    std::vector<T> un_indexed_data;
    for (auto index : indices) {
      if (index >= unique_values.size()) {
        MS_LOG(ERROR) << "index: " << index << " size: " << unique_values.size();
        return RET_ERROR;
      }
      if (unique_values[index] > std::numeric_limits<T>::max() ||
          unique_values[index] < std::numeric_limits<T>::min()) {
        MS_LOG(ERROR) << "data: " << unique_values[index] << " max: " << std::numeric_limits<T>::max()
                      << " min: " << std::numeric_limits<T>::min();
        return RET_ERROR;
      }
      un_indexed_data.push_back(static_cast<T>(unique_values[index]));
    }
    if (un_indexed_data.size() * sizeof(T) != dst_data_size) {
      MS_LOG(ERROR) << "un-indexed data size: " << un_indexed_data.size() * sizeof(T)
                    << " expected by tensor: " << dst_data_size;
      return RET_ERROR;
    }
    memcpy(dst_data, un_indexed_data.data(), un_indexed_data.size() * sizeof(T));

    return RET_OK;
  }
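
  // Note on UnSparseTensorData below (informational summary of the code): the compressed form
  // stores the non-zero unique values together with a run length (coor) of preceding elements;
  // those gaps, and any tail left after the last non-zero value, are filled with the per-channel
  // zeroPoint before the result is copied into the destination buffer.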
MS_LOG(ERROR) << "data: " << nz << " max: " << std::numeric_limits<T>::max() 382 << " min: " << std::numeric_limits<T>::min(); 383 return RET_ERROR; 384 } 385 auto coor = coors[i]; 386 for (size_t j = 0; j < coor; j++) { 387 auto cur_channel = data_index / elem_perchannel; 388 auto zp = quant_params->Get(cur_channel)->zeroPoint(); 389 un_sparsed_data.push_back(zp); 390 data_index++; 391 } 392 un_sparsed_data.push_back(static_cast<T>(unique_values[index])); 393 data_index++; 394 } 395 if (un_sparsed_data.size() * sizeof(T) > dst_data_size) { 396 MS_LOG(ERROR) << "un-sparsed data size: " << un_sparsed_data.size() * sizeof(T) 397 << " tensor size: " << dst_data_size; 398 return RET_ERROR; 399 } else if (un_sparsed_data.size() * sizeof(T) < dst_data_size && 400 (un_sparsed_data.size() + (1 << coor_best_bit) - 1) * sizeof(T) < dst_data_size) { 401 MS_LOG(ERROR) << "un-sparsed data size: " << un_sparsed_data.size() * sizeof(T) 402 << " tensor size: " << dst_data_size << " coor_best_bit: " << coor_best_bit; 403 return RET_ERROR; 404 } 405 406 for (; data_index < dst_data_size / sizeof(T); data_index++) { 407 auto cur_channel = data_index / elem_perchannel; 408 auto zp = quant_params->Get(cur_channel)->zeroPoint(); 409 un_sparsed_data.push_back(static_cast<T>(zp)); 410 } 411 412 memcpy(dst_data, un_sparsed_data.data(), un_sparsed_data.size() * sizeof(T)); 413 414 return RET_OK; 415 } 416 #endif 417 }; 418 } // namespace mindspore::lite 419 #endif // MINDSPORE_LITE_SRC_RUNTIME_WEIGHT_DECODER_H_ 420