/**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_TENSOR_COMPRESSOR_H_
#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_TENSOR_COMPRESSOR_H_

#include <memory>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <numeric>
#include <limits>
#include <functional>
#include <algorithm>
#include "include/errorcode.h"
#include "ir/anf.h"
#include "ir/tensor.h"
#include "src/common/log_adapter.h"
#include "src/common/log_util.h"
#include "schema/inner/model_generated.h"
#include "tools/converter/quantizer/bitpacking.h"
#include "tools/converter/quantizer/quant_params.h"
#include "tools/converter/quantizer/quantize_util.h"

using mindspore::ParameterPtr;
namespace mindspore::lite::quant {
class TensorCompressor {
 public:
  // Compares the original, indexing-compressed and sparsity-compressed encodings of an
  // already-quantized weight and rewrites the tensor with whichever encoding is smallest.
  template <typename T>
  int DoSparseCompress(const ParameterPtr &weight, size_t bit_num,
                       const std::vector<schema::QuantParamT> &quant_params) {
    auto tensor_info = weight->default_param()->cast<tensor::TensorPtr>();
    CHECK_NULL_RETURN(tensor_info);
    if (tensor_info->compression_type() != mindspore::kNoCompression) {
      MS_LOG(INFO) << weight->fullname_with_scope() << " is shared weight.";
      return RET_OK;
    }
    auto max_size = tensor_info->Size();
    auto quant_data_array = static_cast<T *>(tensor_info->data().data());

    std::vector<T> quant_data(quant_data_array, quant_data_array + max_size / sizeof(T));
    auto elem_cnt = quant_data.size();
    auto dims = tensor_info->shape_c();
    size_t elem_cnt_by_dims = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<>());
    if (elem_cnt != elem_cnt_by_dims) {
      MS_LOG(ERROR) << weight->fullname_with_scope() << " elem_cnt: " << elem_cnt
                    << " not equal elem_cnt_by_dims: " << elem_cnt_by_dims;
      return RET_ERROR;
    }

    std::set<T> quant_data_set;
    for (auto quant_value : quant_data) {
      quant_data_set.insert(quant_value);
    }
    std::map<T, size_t> unique_value_index_map;
    auto index = 0;
    for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
      unique_value_index_map[*iter] = index++;
    }

    auto unique_value_cnt = quant_data_set.size();
    size_t unique_value_bit = ceil(log2(unique_value_cnt));
    auto pack_repetition_size_in_bit = bit_num + bit_num * unique_value_cnt + unique_value_bit * elem_cnt;
    size_t pack_repetition_size_in_byte = ceil(1.0 * pack_repetition_size_in_bit / k8Bit);
    size_t origin_size_in_byte = ceil(1.0 * bit_num * elem_cnt / k8Bit);

    size_t coor_best_bit = 0;
    auto nz_cnt = CalCoorBestBit<T>(quant_data, elem_cnt, quant_params, unique_value_bit, &coor_best_bit);
    // 1. coor_best_bit 2. nz_cnt 3. quant_data_set size 4. unique_values 5. unique_value indexing 6. nz values coord
    const auto pack_sparsity_size_in_bit =
      1 * k8Bit + 4 * k8Bit + bit_num + bit_num * unique_value_cnt + unique_value_bit * nz_cnt + nz_cnt * coor_best_bit;
    size_t pack_sparsity_size_in_byte = ceil(1.0 * pack_sparsity_size_in_bit / k8Bit);
    MS_LOG(DEBUG) << "coor_best_bit: " << coor_best_bit << " ori: " << origin_size_in_byte
                  << " indexing: " << pack_repetition_size_in_byte << " sparse: " << pack_sparsity_size_in_byte;
    auto min_byte_need = std::min({origin_size_in_byte, pack_repetition_size_in_byte, pack_sparsity_size_in_byte});
    if (min_byte_need == origin_size_in_byte) {
      return RET_NO_CHANGE;
    } else if (min_byte_need == pack_repetition_size_in_byte) {
      MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_repetition_size_in_byte;
      return IndexingCompress<T>(weight, quant_data_set, unique_value_index_map, unique_value_bit, unique_value_cnt,
                                 pack_repetition_size_in_byte, bit_num);
    } else if (min_byte_need == pack_sparsity_size_in_byte) {
      MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_sparsity_size_in_byte;
      return SparsityCompress<T>(weight, quant_params, quant_data_set, unique_value_index_map, unique_value_bit,
                                 unique_value_cnt, pack_sparsity_size_in_byte, nz_cnt, coor_best_bit, bit_num);
    } else {
      MS_LOG(DEBUG) << "unexpected: " << min_byte_need << " not in {" << origin_size_in_byte << " "
                    << pack_repetition_size_in_byte << " " << pack_sparsity_size_in_byte << "}";
    }
    return RET_NO_CHANGE;
  }

  int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input);

  int DoBitPack(const ParameterPtr &weight, size_t bit_num);

 private:
  // Packs the weight as: unique-value count, the unique-value table, then one table index per element.
  template <typename T>
  int IndexingCompress(const ParameterPtr &weight, const std::set<T> &quant_data_set,
                       const std::map<T, size_t> &unique_value_index_map, size_t unique_value_bit,
                       size_t unique_value_cnt, size_t pack_repetition_size_in_byte, size_t bit_num) {
    std::vector<bool> bits(pack_repetition_size_in_byte * k8Bit);
    size_t index = 0;
    // write unique_value_cnt: bit_num bit for unsigned
    for (size_t i = 0; i < bit_num; i++) {
      bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & (0x1);
    }
    // write the unique value set: each value has bit_num bit signed
    for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
      for (size_t i = 0; i < bit_num; i++) {
        bits[index++] = ((*iter + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
      }
    }

    auto tensor_info = weight->default_param()->cast<tensor::TensorPtr>();
    CHECK_NULL_RETURN(tensor_info);
    auto max_size = tensor_info->ElementsNum();
    auto quant_data = static_cast<T *>(tensor_info->data().data());
    // write the index: each index has unique_value_bit unsigned
    for (int i = 0; i < max_size; i++) {
      auto quant_value = quant_data[i];
      for (size_t j = 0; j < unique_value_bit; j++) {
        bits[index++] = (unique_value_index_map.at(quant_value) >> (unique_value_bit - j - 1)) & (0x1);
      }
    }
    if (index > pack_repetition_size_in_byte * k8Bit) {
      MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
                    << pack_repetition_size_in_byte * k8Bit;
      return RET_ERROR;
    }

    auto ret = SetNewCompressionTensor(weight, bits, bit_num, tensor_info, mindspore::kIndexing);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Add New tensor failed.";
      return RET_ERROR;
    }
    return RET_OK;
  }

  // Packs the weight as: coordinate bit width, non-zero count, unique-value count, the unique-value table,
  // one table index per non-zero element, then the gap-encoded coordinates of those elements.
  template <typename T>
  int SparsityCompress(const ParameterPtr &weight, const std::vector<schema::QuantParamT> &quant_params,
                       const std::set<T> &quant_data_set, const std::map<T, size_t> &unique_value_index_map,
                       size_t unique_value_bit, size_t unique_value_cnt, size_t pack_sparsity_size_in_byte,
                       size_t nz_cnt, size_t coor_best_bit, size_t bit_num) {
    auto tensor_info = weight->default_param()->cast<tensor::TensorPtr>();
    CHECK_NULL_RETURN(tensor_info);
    auto quant_data = static_cast<T *>(tensor_info->data().data());
    int elem_cnt = tensor_info->DataSize();
    auto channel_cnt = quant_params.size();
    if (channel_cnt == 0) {
      MS_LOG(ERROR) << "quant_params is empty.";
      return RET_ERROR;
    }
    auto elem_perchannel = elem_cnt / channel_cnt;

    std::vector<bool> bits(pack_sparsity_size_in_byte * k8Bit);
    size_t index = 0;
    // coor_best_bit
    for (size_t i = 0; i < k8Bit; i++) {
      bits[index++] = (coor_best_bit >> (k8Bit - i - 1)) & 0x1;
    }
    // nz_cnt
    for (size_t i = 0; i < k32Bit; i++) {
      bits[index++] = (nz_cnt >> (k32Bit - i - 1)) & 0x1;
    }
    // unique_value cnt
    for (size_t i = 0; i < bit_num; i++) {
      bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & 0x1;
    }
    // unique_values
    for (auto unique_value : quant_data_set) {
      for (size_t i = 0; i < bit_num; i++) {
        bits[index++] = ((unique_value + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
      }
    }
    // write the index of each non-zero value and record its coordinate (gap from the previous stored element)
    std::vector<size_t> coors(nz_cnt);
    int coors_index = 0;
    int prev_index = -1;
    for (int di = 0; di < elem_cnt; di++) {
      auto cur_channel = di / elem_perchannel;
      auto zp = quant_params[cur_channel].zeroPoint;
      auto nz_value = quant_data[di];
      if (nz_value != zp || static_cast<size_t>(di - prev_index) >= (1u << coor_best_bit)) {
        MS_ASSERT(coors_index < nz_cnt);
        coors[coors_index++] = di - prev_index - 1;
        prev_index = di;
        for (size_t i = 0; i < unique_value_bit; i++) {
          bits[index++] = (unique_value_index_map.at(nz_value) >> (unique_value_bit - i - 1)) & (0x1);
        }
      }
    }
    // write coor
    for (auto coor : coors) {
      for (size_t i = 0; i < coor_best_bit; i++) {
        bits[index++] = (coor >> (coor_best_bit - i - 1)) & 0x1;
      }
    }
    if (index > pack_sparsity_size_in_byte * k8Bit) {
      MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
                    << pack_sparsity_size_in_byte * k8Bit;
      return RET_ERROR;
    }

    auto ret = SetNewCompressionTensor(weight, bits, bit_num, tensor_info, mindspore::kSparse);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Add New tensor failed.";
      return RET_ERROR;
    }
    return RET_OK;
  }

  // Searches coordinate bit widths from k2Bit to k10Bit for the one minimizing the packed size,
  // writes it to *coor_best_bit and returns the corresponding non-zero count.
  template <typename T>
  size_t CalCoorBestBit(const std::vector<T> &quant_data, size_t elem_cnt,
                        const std::vector<schema::QuantParamT> &quant_params, int unique_value_bit,
                        size_t *coor_best_bit) {
    MS_ASSERT(!quant_params.empty());
    size_t best_nn_cnt = 0;
    size_t min_len_in_bit = std::numeric_limits<size_t>::max();
    for (size_t bit = k2Bit; bit <= k10Bit; bit++) {
      // search
      int nn_cnt = 0;
      int prev_index = -1;
      auto channel_cnt = quant_params.size();
      MS_ASSERT(channel_cnt > 0);
      auto elem_perchannel = elem_cnt / channel_cnt;
      for (size_t i = 0; i < elem_cnt; i++) {
        auto cur_channel = i / elem_perchannel;
        auto zp = quant_params[cur_channel].zeroPoint;
        if (quant_data[i] != zp || (static_cast<int>(i) - prev_index) >= ((1 << bit))) {
          nn_cnt++;
          prev_index = i;
        }
      }

      size_t len_in_bit = nn_cnt * bit + nn_cnt * unique_value_bit;
      if (len_in_bit < min_len_in_bit) {
        min_len_in_bit = len_in_bit;
        *coor_best_bit = bit;
        best_nn_cnt = nn_cnt;
      }
    }
    return best_nn_cnt;
  }

  void WriteBufferWithAlignByte(const std::vector<bool> &bool_vec, int8_t *data);

  int SetNewCompressionTensor(const ParameterPtr &weight, const std::vector<bool> &bits, size_t bit_num,
                              const tensor::TensorPtr &tensor_info, TensorCompressionType compression_type);
};
}  // namespace mindspore::lite::quant
#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_TENSOR_COMPRESSOR_H_