1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/lite/tools/optimize/quantize_model.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <limits>
20 #include <memory>
21 #include <string>
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26
27 #include "flatbuffers/flexbuffers.h"
28 #include "tensorflow/lite/context.h"
29 #include "tensorflow/lite/core/api/error_reporter.h"
30 #include "tensorflow/lite/model.h"
31 #include "tensorflow/lite/schema/schema_generated.h"
32 #include "tensorflow/lite/schema/schema_utils.h"
33 #include "tensorflow/lite/tools/optimize/model_utils.h"
34 #include "tensorflow/lite/tools/optimize/operator_property.h"
35 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
36
37 namespace tflite {
38 namespace optimize {
39
40 namespace {
41
42 bool IsFloatTensor(const SubGraphT* subgraph, int32_t tensor_idx) {
43 TensorT* tensor = subgraph->tensors[tensor_idx].get();
44 if (tensor->type != TensorType_FLOAT32) {
45 // Skip non-real-valued tensor.
46 return false;
47 }
48 return true;
49 }
50
51 // Gets the operator property from the operator_property list and additionally
52 // modifies the quantizable parameter based on the user's specified
53 // operator_names.
54 operator_property::OperatorProperty GetOperatorProperty(
55 const std::unordered_set<string>& operator_names, const ModelT* model,
56 int subgraph_index, int op_idx, const string& operator_name,
57 const TensorType& activations_type) {
58 operator_property::OperatorProperty property =
59 operator_property::GetOperatorProperty(model, subgraph_index, op_idx);
60 const SubGraphT* subgraph = model->subgraphs[subgraph_index].get();
61 const OperatorT* op = subgraph->operators[op_idx].get();
62 const BuiltinOperator op_code =
63 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
64 if (activations_type == TensorType_INT16 && !property.quantizable_int16) {
65 property.quantizable = false;
66 }
67 // The algorithm adds Dequantize and Quantize, so we don't require them to be
68 // in the operator_names.
69 if (op_code != BuiltinOperator_DEQUANTIZE &&
70 op_code != BuiltinOperator_QUANTIZE) {
71 property.quantizable =
72 property.quantizable &&
73 (operator_names.find(operator_name) != operator_names.end());
74 }
75 return property;
76 }
77
78 bool IsRealValueOp(const std::unordered_set<string>& real_value_op_set,
79 const string& operator_name) {
80 return real_value_op_set.find(operator_name) != real_value_op_set.end();
81 }
82
83 // Creates a set of the ops that still operate on real (float) values: ops
84 // that are not quantizable, plus quantizable ops with a float input or output.
85 std::unordered_set<string> PopulateRealValueOpSet(
86 ModelT* model, const std::unordered_set<string>& operator_names,
87 const TensorType& activations_type) {
88 std::unordered_set<string> real_value_op_set;
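  // Each op is identified by the name of its first output tensor. An op is
  // added to the set if it is not quantizable, or if any of its relevant
  // inputs or outputs is still a float tensor in the source graph.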
89 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
90 subgraph_idx++) {
91 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
92 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
93 OperatorT* op = subgraph->operators[op_idx].get();
94 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
95 operator_property::OperatorProperty property =
96 GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
97 operator_name, activations_type);
98
99 if (!property.quantizable) {
100 real_value_op_set.insert(operator_name);
101 continue;
102 }
103
104 for (const std::pair<int, operator_property::TensorProperty>& input :
105 property.inputs) {
106 const int32_t input_idx = input.first;
107 const int32_t tensor_idx = op->inputs[input_idx];
108 if (IsFloatTensor(subgraph, tensor_idx)) {
109 real_value_op_set.insert(operator_name);
110 break;
111 }
112 }
113 for (const std::pair<int, operator_property::TensorProperty>& output :
114 property.outputs) {
115 const int32_t output_idx = output.first;
116 const int32_t tensor_idx = op->outputs[output_idx];
117 if (IsFloatTensor(subgraph, tensor_idx)) {
118 real_value_op_set.insert(operator_name);
119 break;
120 }
121 }
122
123 if (property.arbitrary_inputs) {
124 const int32_t tensor_idx = op->inputs[0];
125 if (IsFloatTensor(subgraph, tensor_idx)) {
126 real_value_op_set.insert(operator_name);
127 }
128 }
129
130 if (property.arbitrary_outputs) {
131 const int32_t tensor_idx = op->outputs[0];
132 if (IsFloatTensor(subgraph, tensor_idx)) {
133 real_value_op_set.insert(operator_name);
134 }
135 }
136 }
137 }
138 return real_value_op_set;
139 }
140
141 TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor,
142 const TensorT* weight_tensor, TensorT* bias_tensor,
143 bool is_per_channel, int channel_dim_index,
144 const TensorType& activations_type,
145 ErrorReporter* error_reporter) {
146 if (bias_tensor->shape.size() != 1) {
147 TF_LITE_REPORT_ERROR(error_reporter, "Expected bias tensor to have rank 1.");
148 return kTfLiteError;
149 }
150
151 int32_t channel_dim_size = bias_tensor->shape[0];
152 TF_LITE_ENSURE(error_reporter, weight_tensor->quantization);
153 std::vector<float> weight_scales = weight_tensor->quantization->scale;
154
155 if (is_per_channel) {
156 if (bias_tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
157 TF_LITE_REPORT_ERROR(
158 error_reporter,
159 "Channel mismatch between bias and weight tensors %d vs %d",
160 bias_tensor->shape[0], weight_tensor->shape[channel_dim_index]);
161 return kTfLiteError;
162 }
163 if (!input_tensor->quantization ||
164 input_tensor->quantization->scale.size() != 1) {
165 TF_LITE_REPORT_ERROR(error_reporter,
166 "Input tensor missing quantization information");
167 return kTfLiteError;
168 }
169
170 if (weight_scales.size() != channel_dim_size) {
171 TF_LITE_REPORT_ERROR(error_reporter,
172 "Mismatch weight scale dimension: %d",
173 weight_scales.size());
174 return kTfLiteError;
175 }
176 if (activations_type == tflite::TensorType_INT16) {
177 return utils::SymmetricPerChannelBiasQuantize<std::int64_t>(
178 model, bias_tensor, input_tensor->quantization->scale[0],
179 weight_scales.data(), channel_dim_size, error_reporter);
180 } else {
181 return utils::SymmetricPerChannelBiasQuantize<std::int32_t>(
182 model, bias_tensor, input_tensor->quantization->scale[0],
183 weight_scales.data(), channel_dim_size, error_reporter);
184 }
185 } else {
186 if (weight_scales.size() != 1) {
187 TF_LITE_REPORT_ERROR(
188 error_reporter,
189 "Expected per-layer weight scale dimension size 1, got %d",
190 weight_scales.size());
191 return kTfLiteError;
192 }
193 if (activations_type == tflite::TensorType_INT16) {
194 return utils::SymmetricPerLayerBiasQuantize<std::int64_t>(
195 model, bias_tensor,
196 input_tensor->quantization->scale[0] * weight_scales[0],
197 error_reporter);
198 } else {
199 return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
200 model, bias_tensor,
201 input_tensor->quantization->scale[0] * weight_scales[0],
202 error_reporter);
203 }
204 }
205 return kTfLiteError;
206 }
207
208 // True if the tensor type has to be modified.
209 bool TensorTypeChangeRequired(const TensorT* tensor, const TensorType& type) {
210 // The quantized model is type INT8/INT16, so if the user provided type is
211 // INT8/INT16, we do not have to do any custom logic. Additionally, if the
212 // current tensor isn't INT8/INT16 quantized, the custom type doesn't apply.
213 bool int8check = type != TensorType_INT8 && tensor->type == TensorType_INT8 &&
214 !tensor->quantization->scale.empty();
215 bool int16check = type != TensorType_INT16 &&
216 tensor->type == TensorType_INT16 &&
217 !tensor->quantization->scale.empty();
218 return (int8check || int16check);
219 }
220
221 // Check if input is consumed by quantize, which means we don't need to
222 // requantize if the output scale is the same as the input tensor's.
223 bool InputQuantizeRequired(const ModelT* model, const SubGraphT* subgraph,
224 int32_t input_idx) {
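  // Collect every Quantize op that consumes this input. If any other kind of
  // op consumes it, a new Quantize op is required.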
225 std::vector<OperatorT*> quantize_ops;
226 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
227 OperatorT* op = subgraph->operators[op_idx].get();
228 if (std::find(op->inputs.begin(), op->inputs.end(), input_idx) !=
229 op->inputs.end()) {
230 const BuiltinOperator op_code =
231 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
232 if (op_code != BuiltinOperator_QUANTIZE) {
233 return true;
234 }
235 quantize_ops.push_back(op);
236 }
237 }
238 if (quantize_ops.size() == 1) {
239 const auto* tensor = subgraph->tensors[input_idx].get();
240 const auto* op = quantize_ops[0];
241 const int32_t output_idx = op->outputs[0];
242 const auto output_type = subgraph->tensors[output_idx]->type;
243 const float output_scale =
244 subgraph->tensors[output_idx]->quantization->scale[0];
245 const int64_t output_zero_point =
246 subgraph->tensors[output_idx]->quantization->zero_point[0];
247 if (output_type == tensor->type &&
248 output_scale == tensor->quantization->scale[0] &&
249 output_zero_point == tensor->quantization->zero_point[0]) {
250 return false;
251 }
252 }
253 return true;
254 }
255
256 // Sets the input type, adding a Leading Op node at the start of the model if
257 // necessary.
258 // Returns the new input tensor index.
259 int32_t SetInputType(ModelT* model, SubGraphT* subgraph,
260 const int32_t tensor_idx, const TensorType& input_type,
261 const TensorType& activations_type) {
262 TensorT* tensor = subgraph->tensors[tensor_idx].get();
263 if (!TensorTypeChangeRequired(tensor, input_type)) {
264 return -1;
265 }
266 if (input_type == TensorType_FLOAT32 || input_type == TensorType_UINT8) {
267 std::string type_string =
268 activations_type == TensorType_INT16 ? "int16" : "int8";
269 // Create a new tensor to be the input of the leading Op.
270 std::unique_ptr<TensorT> leading_op_input;
271 if (input_type == TensorType_FLOAT32) {
272 // Add tensor for quantize operator. Scales and zero points are not
273 // needed.
274 const string leading_op_name = tensor->name;
275 const string new_name_original_input = tensor->name + "_" + type_string;
276 tensor->name = new_name_original_input;
277 utils::MakeTensor(leading_op_name, tensor->shape, tensor->shape_signature,
278 input_type, &leading_op_input);
279 } else {
280 // Get scale and zero point from the first tensor.
281 const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
282 const int64_t zero_point =
283 subgraph->tensors[tensor_idx]->quantization->zero_point[0];
284
285 // Add tensor for requantize operator. Scale is the existing scale and
286 // zero point is shifted by +128.
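      // (e.g. an int8 zero point of -5 becomes a uint8 zero point of 123).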
287 TFLITE_DCHECK_GE(zero_point, -128);
288 TFLITE_DCHECK_LE(zero_point, 127);
289 const string leading_op_name = tensor->name;
290 const string new_name_original_input = tensor->name + "_" + type_string;
291 tensor->name = new_name_original_input;
292 utils::MakeTensorWithQuantParam(
293 leading_op_name, tensor->shape, tensor->shape_signature, input_type,
294 scale, zero_point + 128, &leading_op_input);
295 }
296
297 // Check if quantize op already exists.
298 if (!InputQuantizeRequired(model, subgraph, tensor_idx)) {
299 subgraph->tensors[tensor_idx] = std::move(leading_op_input);
300 return tensor_idx;
301 }
302
303 const int32_t leading_op_input_idx = subgraph->tensors.size();
304 subgraph->tensors.push_back(std::move(leading_op_input));
305
306 // Create the leading op, which is a Quantize op that quantizes or
307 // requantizes the input.
308 std::unique_ptr<OperatorT> leading_op;
309 utils::MakeQuantizeOperator(model, &leading_op, leading_op_input_idx,
310 tensor_idx);
311
312 // Insert the new op at the start of the model.
313 subgraph->operators.insert(subgraph->operators.begin(),
314 std::move(leading_op));
315 return leading_op_input_idx;
316 }
317 return -1;
318 }
319
320 // Sets the output type, adding a Tailing Op node at the end of the model if
321 // necessary.
322 // Returns the new output tensor index.
323 int32_t SetOutputType(ModelT* model, SubGraphT* subgraph,
324 const int32_t tensor_idx, const TensorType& output_type,
325 const TensorType& activations_type) {
326 TensorT* tensor = subgraph->tensors[tensor_idx].get();
327 if (!TensorTypeChangeRequired(tensor, output_type)) {
328 return -1;
329 }
330 if (output_type == TensorType_FLOAT32 || output_type == TensorType_UINT8) {
331 std::string type_string =
332 activations_type == TensorType_INT16 ? "int16" : "int8";
333 // Create a new tensor to be the output of the tailing op.
334 std::unique_ptr<TensorT> tailing_op_output;
335 if (output_type == TensorType_FLOAT32) {
336 const string tailing_op_name = tensor->name;
337 const string new_name_original_output = tensor->name + "_" + type_string;
338 tensor->name = new_name_original_output;
339 utils::MakeTensor(tailing_op_name, tensor->shape, tensor->shape_signature,
340 output_type, &tailing_op_output);
341 } else {
342 // Get scale and zero point from the last tensor.
343 const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
344 const int64_t zero_point =
345 subgraph->tensors[tensor_idx]->quantization->zero_point[0];
346
347 // Add tensor for requantize operator. Scale is the existing scale and
348 // zero point is shifted by +128.
349 TFLITE_DCHECK_GE(zero_point, -128);
350 TFLITE_DCHECK_LE(zero_point, 127);
351 const string tailing_op_name = tensor->name;
352 const string new_name_original_output = tensor->name + "_" + type_string;
353 tensor->name = new_name_original_output;
354 utils::MakeTensorWithQuantParam(
355 tailing_op_name, tensor->shape, tensor->shape_signature, output_type,
356 scale, zero_point + 128, &tailing_op_output);
357 }
358 const int32_t tailing_op_output_idx = subgraph->tensors.size();
359 subgraph->tensors.push_back(std::move(tailing_op_output));
360
361 // Create the tailing operation.
362 std::unique_ptr<OperatorT> tailing_op;
363 if (output_type == TensorType_FLOAT32) {
364 // Tailing Op is Dequantize Op.
365 utils::MakeDequantizeOperator(model, &tailing_op, tensor_idx,
366 tailing_op_output_idx);
367 } else {
368 // Tailing Op is Quantize Op that does requantization.
369 utils::MakeQuantizeOperator(model, &tailing_op, tensor_idx,
370 tailing_op_output_idx);
371 }
372 // Add the operator at the end of the model.
373 subgraph->operators.push_back(std::move(tailing_op));
374 return tailing_op_output_idx;
375 }
376 return -1;
377 }
378
379 // Sets the input and output types to the provided types. Leading and
380 // tailing operations will be added if needed.
381 // For Float input and output, leading op is Quantize and tailing op is
382 // Dequantize.
383 // For Uint8 input and output, leading op is Quantize (uint8 to
384 // int8, can be thought as "requant") and tailing op is also Quantize (int8 to
385 // uint8, can be thought as "requant").
386 TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
387 const TensorType& output_type,
388 const TensorType& activations_type,
389 ErrorReporter* error_reporter) {
390 for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
391 subgraph_idx++) {
392 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
393
394 for (int i = 0; i < subgraph->inputs.size(); ++i) {
395 TensorT* tensor = subgraph->tensors[subgraph->inputs[i]].get();
396 // TODO(suharshs): Add support for this case if it ever comes up.
397 if (tensor->type == TensorType_FLOAT32 && input_type != tensor->type) {
398 TF_LITE_REPORT_ERROR(
399 error_reporter,
400 "Unsupported input type %s for input tensor %d of type %s.",
401 EnumNameTensorType(input_type), subgraph->inputs[i],
402 EnumNameTensorType(tensor->type));
403 return kTfLiteError;
404 }
405 const int32_t input_idx = SetInputType(
406 model, subgraph, subgraph->inputs[i], input_type, activations_type);
407 if (input_idx < 0) {
408 continue;
409 }
410 subgraph->inputs[i] = input_idx;
411 }
412 for (int i = 0; i < subgraph->outputs.size(); ++i) {
413 TensorT* tensor = subgraph->tensors[subgraph->outputs[i]].get();
414 // TODO(suharshs): Add support for this case if it ever comes up.
415 if (tensor->type == TensorType_FLOAT32 && output_type != tensor->type) {
416 TF_LITE_REPORT_ERROR(
417 error_reporter,
418 "Unsupported output type %s for output tensor '%s' of type %s.",
419 EnumNameTensorType(output_type), tensor->name.c_str(),
420 EnumNameTensorType(tensor->type));
421 return kTfLiteError;
422 }
423 const int32_t output_idx = SetOutputType(
424 model, subgraph, subgraph->outputs[i], output_type, activations_type);
425 if (output_idx < 0) {
426 continue;
427 }
428 subgraph->outputs[i] = output_idx;
429 }
430 }
431 return kTfLiteOk;
432 }
433
434 // Apply constraints to ops if they have any.
435 // We have made the restriction that for int8 quantized concat, minimum, and
436 // maximum, the inputs and outputs must have the same scale and zero point.
437 // The other ones with constraints are handled in QuantizeWeightsAndInput.
438 TfLiteStatus ApplyConstraints(
439 ModelT* model, const std::unordered_set<string>& operator_names,
440 const std::unordered_set<string>& real_value_op_set,
441 TensorType activations_type, ErrorReporter* error_reporter) {
442 for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
443 subgraph_idx++) {
444 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
445 // Iterate backward to avoid messing with index.
446 for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) {
447 OperatorT* op = subgraph->operators[op_idx].get();
448 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
449 operator_property::OperatorProperty property =
450 GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
451 operator_name, activations_type);
452 if (!property.quantizable ||
453 !IsRealValueOp(real_value_op_set, operator_name)) {
454 continue;
455 }
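      // Only ops that take an arbitrary number of inputs and require the same
      // scale and zero point on inputs and outputs (e.g. concat) are handled
      // here.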
456 if (!property.arbitrary_inputs ||
457 !property.restrict_same_input_output_scale) {
458 continue;
459 }
460 // If ApplyConstraints and requant is needed, use the min of min and max
461 // of max, which means using the scale and zero point of output.
462 TensorT* output_tensor = subgraph->tensors[op->outputs[0]].get();
463 if (!utils::QuantizationParametersExist(output_tensor)) {
464 TF_LITE_REPORT_ERROR(
465 error_reporter,
466 "Unable to get scale or zero point from the tensor at %d.",
467 op->outputs[0]);
468 return kTfLiteError;
469 }
470 const float output_scale = output_tensor->quantization->scale[0];
471 const float output_zp = output_tensor->quantization->zero_point[0];
472 for (size_t input_idx = 0; input_idx < op->inputs.size(); ++input_idx) {
473 TensorT* input_tensor = subgraph->tensors[op->inputs[input_idx]].get();
474 if (!utils::QuantizationParametersExist(input_tensor)) {
475 TF_LITE_REPORT_ERROR(
476 error_reporter,
477 "Unable to get scale or zero point from tensor at %d.",
478 op->inputs[input_idx]);
479 return kTfLiteError;
480 }
481 if (input_tensor->quantization->scale[0] == output_scale &&
482 input_tensor->quantization->zero_point[0] == output_zp) {
483 // This input does not need to be requantized.
484 continue;
485 }
486
487 std::unique_ptr<TensorT> additional_tensor;
488 const string requant_tensor_name = input_tensor->name + "_requantized";
489 utils::MakeTensorWithQuantParam(
490 requant_tensor_name, input_tensor->shape,
491 input_tensor->shape_signature, activations_type, output_scale,
492 output_zp, &additional_tensor);
493 const int32_t additional_tensor_idx = subgraph->tensors.size();
494 subgraph->tensors.push_back(std::move(additional_tensor));
495
496 // Add requant op before this input.
497 // A better way to handle this would be to push the rescale upwards
498 // recursively, hoping all upstream ops can absorb it, and only add a
499 // requant op when there is no other way.
500 std::unique_ptr<OperatorT> requant_op;
501 utils::MakeQuantizeOperator(model, &requant_op, op->inputs[input_idx],
502 additional_tensor_idx);
503 op->inputs[input_idx] = additional_tensor_idx;
504
505 subgraph->operators.insert(subgraph->operators.begin() + op_idx,
506 std::move(requant_op));
507 }
508 }
509 }
510 return kTfLiteOk;
511 }
512
513 // In case of int16 activations, there are two implementations of kernels for
514 // ADD/SUB operators. We set the builtin option pot_scale_int16
515 // during quantization so that from now only the general case implementation is
516 // used.
517 void SetOperatorPropertyADDSUBOperator(ModelT* model,
518 const TensorType& activations_type) {
519 if (activations_type != TensorType_INT16) {
520 // This is needed only in case of int16 activations.
521 return;
522 }
523
524 for (int subgraph_idx = 0, end = model->subgraphs.size(); subgraph_idx < end;
525 subgraph_idx++) {
526 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
527 // Iterate backward to avoid messing with index.
528 for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) {
529 OperatorT* op = subgraph->operators[op_idx].get();
530 OperatorCodeT* op_code = model->operator_codes[op->opcode_index].get();
531 if (op_code && op_code->builtin_code == BuiltinOperator_ADD) {
532 {
533 auto* options = op->builtin_options.AsAddOptions();
534 if (options) {
535 options->pot_scale_int16 = false;
536 }
537 }
538 }
539 if (op_code && op_code->builtin_code == BuiltinOperator_SUB) {
540 {
541 auto* options = op->builtin_options.AsSubOptions();
542 if (options) {
543 options->pot_scale_int16 = false;
544 }
545 }
546 }
547 }
548 }
549 }
550
551 std::vector<std::pair<int, operator_property::TensorProperty>> GetInputs(
552 const OperatorT* op, operator_property::OperatorProperty property) {
553 std::vector<std::pair<int, operator_property::TensorProperty>> inputs;
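  // For variadic or non-quantizable ops, every input gets a default (empty)
  // tensor property; otherwise the per-input properties are used.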
554 if (property.arbitrary_inputs || !property.quantizable) {
555 for (int i = 0; i < op->inputs.size(); ++i) {
556 inputs.push_back({i, {}});
557 }
558 } else {
559 inputs = property.inputs;
560 }
561 return inputs;
562 }
563
564 std::vector<std::pair<int, operator_property::TensorProperty>> GetOutputs(
565 const OperatorT* op, operator_property::OperatorProperty property) {
566 std::vector<std::pair<int, operator_property::TensorProperty>> outputs;
567 if (property.arbitrary_outputs) {
568 for (int i = 0; i < op->outputs.size(); ++i) {
569 outputs.push_back({i, {}});
570 }
571 } else {
572 outputs = property.outputs;
573 }
574 return outputs;
575 }
576
577 bool ShouldRestrictSameInputOutputScale(
578 operator_property::OperatorProperty property) {
579 // Ops with an arbitrary number of inputs (e.g. concat, maximum and minimum)
580 // get restricted in ApplyConstraints.
581 return (!property.arbitrary_inputs &&
582 property.restrict_same_input_output_scale);
583 }
584
585 bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
586 for (const int32_t input_idx : subgraph->inputs) {
587 if (index == input_idx) {
588 return true;
589 }
590 }
591 return false;
592 }
593
594 // Quantize the op input. Will increment op_idx if ops are added.
595 TfLiteStatus QuantizeOpInput(
596 ModelT* model, int32_t subgraph_idx, size_t* op_idx,
597 operator_property::OperatorProperty property,
598 const std::pair<int32_t, operator_property::TensorProperty>& input,
599 const TensorType& activations_type, ErrorReporter* error_reporter) {
600 int32_t input_idx = input.first;
601 operator_property::TensorProperty tensor_property = input.second;
602 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
603 OperatorT* op = subgraph->operators[*op_idx].get();
604 const BuiltinOperator op_code =
605 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
606 if (input_idx >= op->inputs.size()) {
607 TF_LITE_REPORT_ERROR(
608 error_reporter,
609 "Required input index %d is larger than the input length of op "
610 "%s at index %d in subgraph %d",
611 input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code), *op_idx,
612 subgraph_idx);
613 return kTfLiteError;
614 }
615 const int32_t tensor_idx = op->inputs[input_idx];
616 if (tensor_idx == -1) {
617 // Skip optional tensor.
618 return kTfLiteOk;
619 }
620 TensorT* tensor = subgraph->tensors[tensor_idx].get();
621 // Assumes op is quantized to int8.
622 const bool is_input_quantized = utils::QuantizationParametersExist(tensor);
623 if (property.quantizable && !is_input_quantized) {
624 // The operation is quantizable, but the input isn't yet quantized.
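    // Constant tensors (those backed by a buffer) are quantized directly from
    // their data below; activations with recorded min/max are handled in the
    // HasMinMax branch further down.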
625 if (utils::HasBuffer(model, subgraph, tensor_idx)) {
626 // TODO(suharshs): Look at consumers, throw error if one consumer is
627 // per-channel and one per-layer.
628 bool quantize_const_input = property.quantize_input_as_activations &&
629 activations_type == TensorType_INT16;
630 if (tensor_property.number_of_bits == 8 && !quantize_const_input) {
631 if (tensor_property.use_derived_scale) {
632 // Currently 8bit tensors in input do not accept derived scale.
633 return kTfLiteError;
634 }
635 if (utils::QuantizeWeight(model, tensor, tensor_property.per_axis,
636 tensor_property.per_axis_index,
637 error_reporter) != kTfLiteOk) {
638 TF_LITE_REPORT_ERROR(
639 error_reporter,
640 "Unable to quantize buffer or min/max value for input %d "
641 "in op %s in subgraph %d, node: %d",
642 input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
643 *op_idx);
644 return kTfLiteError;
645 }
646 } else if (tensor_property.number_of_bits == 16 || quantize_const_input) {
647 if (tensor_property.use_derived_scale) {
648 // Currently 16bit tensors in input do not accept derived scale.
649 return kTfLiteError;
650 }
651 TensorT* tensor = subgraph->tensors[tensor_idx].get();
652 int total_size = 1;
653 for (int i = 0; i < tensor->shape.size(); ++i) {
654 total_size *= tensor->shape[i];
655 }
656 BufferT* buffer = model->buffers[tensor->buffer].get();
657 float* float_data = reinterpret_cast<float*>(buffer->data.data());
658 auto minmax = std::minmax_element(float_data, float_data + total_size);
659 const float min = *minmax.first;
660 const float max = *minmax.second;
661 const float range = std::max(std::abs(min), std::abs(max));
662 // The narrow range quantized value for int16.
663 const float quantize_range = 32767.0;
664 const float scale = range / quantize_range;
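        // e.g. values in [-3.0, 5.0] give range = 5.0 and
        // scale = 5.0 / 32767 (about 1.526e-4).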
665 return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
666 error_reporter);
667 } else if (tensor_property.number_of_bits == 32) {
668 if (!tensor_property.use_derived_scale) {
669 // Currently 32 bit tensors in input only accept derived scale.
670 return kTfLiteError;
671 }
672 TensorT* tensor = subgraph->tensors[tensor_idx].get();
673 const float scale = utils::GetEffectiveScale(
674 model, subgraph, *op_idx,
675 tensor_property.derived_scale.input_tensors,
676 tensor_property.derived_scale.intermediate_tensors,
677 tensor_property.derived_scale.factors);
678 return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
679 model, tensor, scale, error_reporter);
680
681 } else if (tensor_property.number_of_bits == 10) {
682 // When the number of bits is 10 (instead of 16), quantize the tensor to
683 // [-512, 512], instead of [-32767, 32767].
684 TensorT* tensor = subgraph->tensors[tensor_idx].get();
685 int total_size = 1;
686 for (int i = 0; i < tensor->shape.size(); ++i) {
687 total_size *= tensor->shape[i];
688 }
689 BufferT* buffer = model->buffers[tensor->buffer].get();
690 float* buffer_data = reinterpret_cast<float*>(buffer->data.data());
691 auto minmax =
692 std::minmax_element(buffer_data, buffer_data + total_size);
693 const float range =
694 std::max(std::abs(*minmax.first), std::abs(*minmax.second));
695 const float quantized_range = 512.0;
696 const float scale = range / quantized_range;
697 return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
698 error_reporter);
699 } else {
700 // Only 8, 16, 32, 10 are supported.
701 // TODO(jianlijianli): extend this to support arbitrary bits.
702 TF_LITE_REPORT_ERROR(
703 error_reporter,
704 "Unable to quantize buffer or min/max value for input %d "
705 "in op %s in subgraph %d, node: %d",
706 input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
707 return kTfLiteError;
708 }
709 } else if (utils::HasMinMax(tensor)) {
710 if (IsSubgraphInput(subgraph, tensor_idx) ||
711 tensor_property.state_tensor) {
712 if (tensor_property.number_of_bits == 8) {
713 if (tensor_property.use_derived_scale) {
714 // Currently 8bit tensors in input do not accept derived scale.
715 return kTfLiteError;
716 }
717 TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
718 tensor, activations_type, error_reporter));
719 } else if (tensor_property.number_of_bits == 16) {
720 TensorT* tensor = subgraph->tensors[tensor_idx].get();
721 float quantized_range = 32767.0;
722 float range = std::max(std::abs(tensor->quantization->min[0]),
723 std::abs(tensor->quantization->max[0]));
724 if (tensor_property.extend_to_power_of_two) {
725 const int power_of_two_scale = utils::GetPowerOfTwoScale(
726 tensor->quantization->min[0], tensor->quantization->max[0]);
727 range = std::pow(2, power_of_two_scale);
728 quantized_range = 32768.0;
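          // Rounding the range up to a power of two makes the resulting scale
          // itself a power of two (e.g. 8.0 / 32768 = 2^-12).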
729 }
730 const float scale = range / quantized_range;
731 utils::QuantizeActivationToInt16(tensor, scale);
732 }
733 } else {
734 // If the tensor is not a model input, we need to add a Quantize
735 // operation since the preceding op may require a float output.
736 std::string type_string =
737 activations_type == TensorType_INT16 ? "int16" : "int8";
738 std::unique_ptr<TensorT> op_output;
739 utils::MakeTensor(tensor->name + "_" + type_string, tensor->shape,
740 tensor->shape_signature, activations_type,
741 &op_output);
742 op_output->quantization = absl::make_unique<QuantizationParametersT>();
743 op_output->quantization->min.push_back(tensor->quantization->min[0]);
744 op_output->quantization->max.push_back(tensor->quantization->max[0]);
745 TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
746 op_output.get(), activations_type, error_reporter));
747 const int32_t quant_op_output_idx = subgraph->tensors.size();
748 subgraph->tensors.push_back(std::move(op_output));
749 std::unique_ptr<OperatorT> quant_op;
750 utils::MakeQuantizeOperator(model, &quant_op, tensor_idx,
751 quant_op_output_idx);
752 subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
753 std::move(quant_op));
754 op->inputs[input_idx] = quant_op_output_idx;
755 *op_idx += 1;
756 }
757 } else {
758 TF_LITE_REPORT_ERROR(error_reporter,
759 "Unable to find buffer or min/max value for input "
760 "%d in %s in subgraph %d, node: %d",
761 input_idx, EnumNameBuiltinOperator(op_code),
762 subgraph_idx, *op_idx);
763 return kTfLiteError;
764 }
765 } else if (!property.quantizable && is_input_quantized) {
766 // If the tensor is quantized, we have to add a Dequantize op after
767 // since this op is not quantizable.
768 std::unique_ptr<TensorT> op_output;
769 utils::MakeTensor(tensor->name + "_float", tensor->shape,
770 tensor->shape_signature, TensorType_FLOAT32, &op_output);
771 const int32_t dequant_op_output_idx = subgraph->tensors.size();
772 subgraph->tensors.push_back(std::move(op_output));
773 std::unique_ptr<OperatorT> dequant_op;
774 utils::MakeDequantizeOperator(model, &dequant_op, tensor_idx,
775 dequant_op_output_idx);
776 subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
777 std::move(dequant_op));
778 op->inputs[input_idx] = dequant_op_output_idx;
779 *op_idx += 1;
780 }
781 return kTfLiteOk;
782 }
783
784 // Quantize the op output.
785 TfLiteStatus QuantizeOpOutput(
786 ModelT* model, int32_t subgraph_idx, int32_t op_idx,
787 operator_property::OperatorProperty property,
788 const std::pair<int32_t, operator_property::TensorProperty>& output,
789 TensorType activations_type, ErrorReporter* error_reporter) {
790 int32_t output_idx = output.first;
791 operator_property::TensorProperty tensor_property = output.second;
792 // If the operator is not quantizable, we don't need to do anything for the
793 // output.
794 if (!property.quantizable) {
795 return kTfLiteOk;
796 }
797 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
798 OperatorT* op = subgraph->operators[op_idx].get();
799 const BuiltinOperator op_code =
800 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
801 if (output_idx >= op->outputs.size()) {
802 TF_LITE_REPORT_ERROR(
803 error_reporter,
804 "Required output index %d is larger than the output length of "
805 "op %s at index %d in subgraph %d",
806 output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
807 op_idx, subgraph_idx);
808 return kTfLiteError;
809 }
810
811 TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
812 if (utils::QuantizationParametersExist(output_tensor)) {
813 // Skip output if it has been quantized.
814 return kTfLiteOk;
815 }
816 if (ShouldRestrictSameInputOutputScale(property)) {
817 // Copy the quantization parameters. For average pool, max pool, etc.,
818 // min/max can be different but we want them to be the same.
819 // Get scale and zero point of input.
820 if (property.inputs[0].first >= op->inputs.size()) {
821 TF_LITE_REPORT_ERROR(
822 error_reporter,
823 "Required input index %d is larger than the input length of "
824 "op %s at index %d in subgraph %d",
825 property.inputs[0].first, op->inputs.size(),
826 EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
827 return kTfLiteError;
828 }
829 const int input_tensor_idx = op->inputs[property.inputs[0].first];
830 TensorT* input_tensor = subgraph->tensors[input_tensor_idx].get();
831 if (input_tensor->quantization->scale.size() != 1 ||
832 input_tensor->quantization->zero_point.size() != 1) {
833 TF_LITE_REPORT_ERROR(error_reporter,
834 "Invalid quantization params for op %s at index %d "
835 "in subgraph %d",
836 EnumNameBuiltinOperator(op_code), op_idx,
837 subgraph_idx);
838 return kTfLiteError;
839 }
840
841 const float input_scale = input_tensor->quantization->scale[0];
842 const int32_t input_zero_point = input_tensor->quantization->zero_point[0];
843
844 // Apply to output.
845 output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
846 output_tensor->quantization->scale.push_back(input_scale);
847 output_tensor->quantization->zero_point.push_back(input_zero_point);
848 if (!input_tensor->quantization->min.empty()) {
849 const float min = input_tensor->quantization->min[0];
850 output_tensor->quantization->min = {min};
851 }
852 if (!input_tensor->quantization->max.empty()) {
853 const float max = input_tensor->quantization->max[0];
854 output_tensor->quantization->max = {max};
855 }
856 output_tensor->type = activations_type;
857 } else if (tensor_property.restriction) {
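    // Some ops (softmax and logistic, for example) require a fixed output
    // scale and zero point; use the restricted values for the chosen
    // activation type.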
858 const auto scale_and_zp = activations_type == TensorType_INT16
859 ? tensor_property.restricted_value_int16
860 : tensor_property.restricted_value_int8;
861
862 // Apply to output.
863 output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
864 output_tensor->quantization->scale.push_back(scale_and_zp.first);
865 output_tensor->quantization->zero_point.push_back(scale_and_zp.second);
866 output_tensor->type = activations_type;
867 } else {
868 // Process regular output that doesn't have any restrictions.
869 if (utils::HasMinMax(output_tensor)) {
870 utils::QuantizeActivation(output_tensor, activations_type,
871 error_reporter);
872 } else {
873 TF_LITE_REPORT_ERROR(
874 error_reporter,
875 "Unable to find min/max value for output %d in %s in "
876 "subgraph %d, node: %d",
877 output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
878 return kTfLiteError;
879 }
880 }
881 return kTfLiteOk;
882 }
883
884 TfLiteStatus QuantizeIntemediateTensors(ModelT* model,
885 TensorType activations_type,
886 ErrorReporter* error_reporter) {
887 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
888 subgraph_idx++) {
889 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
890 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
891 operator_property::OperatorProperty property =
892 operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
893 if (!property.intermediates.empty()) {
894 OperatorT* op = subgraph->operators[op_idx].get();
895 const BuiltinOperator op_code =
896 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
897 for (const std::pair<int, operator_property::TensorProperty>& input :
898 property.intermediates) {
899 const int index_local = input.first;
900 const int index_global = op->intermediates[index_local];
901 if (index_global == -1) {
902 // Skip optional tensor.
903 continue;
904 }
905 if (input.second.number_of_bits == 8 &&
906 input.second.symmetric == false) {
907 TensorT* tensor = subgraph->tensors[index_global].get();
908 if (tensor->quantization == nullptr) {
909 continue;
910 }
911 if (utils::HasMinMax(tensor)) {
912 utils::QuantizeActivation(tensor, activations_type,
913 error_reporter);
914 } else {
915 TF_LITE_REPORT_ERROR(error_reporter,
916 "Unable to find min/max value for "
917 "intermediate tensor %d in %s in "
918 "subgraph %d, node: %d",
919 index_local,
920 EnumNameBuiltinOperator(op_code),
921 subgraph_idx, op_idx);
922 return kTfLiteError;
923 }
924 } else if (input.second.number_of_bits == 16 &&
925 input.second.symmetric == true) {
926 TensorT* tensor = subgraph->tensors[index_global].get();
927 if (tensor->quantization == nullptr) {
928 continue;
929 }
930 const float min = tensor->quantization->min[0];
931 const float max = tensor->quantization->max[0];
932 const float range = std::max(std::abs(min), std::abs(max));
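          // Reject a degenerate (near-zero) range, which cannot produce a
          // usable scale.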
933 if (range < 1e-8) {
934 return kTfLiteError;
935 }
936
937 // Get scale and zero point.
938 const float quantized_range = 32767.0;
939 const float scale = range / quantized_range;
940 utils::QuantizeActivationToInt16(tensor, scale);
941 } else {
942 return kTfLiteError;
943 }
944 }
945 }
946 }
947 }
948 return kTfLiteOk;
949 }
950
951 // Quantize tensors that have a shared range. For example, in LSTM, the
952 // output tensor and the input state tensor should share the same range
953 // because they use the same scale and zero point.
954 // We have to model this explicitly because the output is modeled as an extra
955 // tensor in LSTM. In the calibrator, state tensors are logged both before and
956 // after the inference, so their range is fully captured. The output, although
957 // identical to the activation, is not a state tensor, so the input value
958 // (range) of the very first inference is not captured.
959 TfLiteStatus QuantizeSharedRange(ModelT* model, ErrorReporter* error_reporter) {
960 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
961 subgraph_idx++) {
962 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
963 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
964 operator_property::OperatorProperty property =
965 operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
966 if (!property.intermediates.empty()) {
967 OperatorT* op = subgraph->operators[op_idx].get();
968 for (const std::vector<int>& input : property.restrict_scale) {
969 if (input.empty()) {
970 continue;
971 }
972 // Currently only pairs of two tensors are supported.
973 // TODO(b/174534943): extend to arbitrary number of tensors.
974 if (input.size() != 2) {
975 return kTfLiteError;
976 }
977 const int index_1 = input[0];
978 const int index_2 = input[1];
979 // TODO(jianlijianli): model input/output.
980 TensorT* tensor_1 = subgraph->tensors[op->inputs[index_1]].get();
981 TensorT* tensor_2 = subgraph->tensors[op->outputs[index_2]].get();
982 const float min_of_min = std::min(tensor_1->quantization->min[0],
983 tensor_2->quantization->min[0]);
984 const float max_of_max = std::max(tensor_1->quantization->max[0],
985 tensor_2->quantization->max[0]);
986 if (min_of_min == 0.0 && max_of_max == 0.0) {
987 return kTfLiteError;
988 }
989
990 // Asymmetric quantization to 8 bit.
991 auto quantization_params =
992 absl::make_unique<QuantizationParametersT>();
993 utils::GetAsymmetricQuantizationParams(
994 min_of_min, max_of_max, -128, 127, quantization_params.get());
995
996 // Populate both tensors with the same parameters.
997 const float scale = quantization_params->scale[0];
998 const int32 zero_point = quantization_params->zero_point[0];
999 for (TensorT* tensor : {tensor_1, tensor_2}) {
1000 tensor->quantization = absl::make_unique<QuantizationParametersT>();
1001 tensor->quantization->scale.push_back(scale);
1002 tensor->quantization->zero_point.push_back(zero_point);
1003 tensor->type = TensorType_INT8;
1004 }
1005 }
1006 }
1007 }
1008 }
1009 return kTfLiteOk;
1010 }
1011
1012 // Quantize inputs and weights.
1013 // Because of ops such as LSTM, this still has to be done per op rather than per weight tensor.
1014 TfLiteStatus QuantizeWeightsInputOutput(
1015 ModelT* model, bool allow_float,
1016 const std::unordered_set<string>& operator_names,
1017 const std::unordered_set<string>& real_value_op_set,
1018 const TensorType& activations_type, ErrorReporter* error_reporter) {
1019 // Flag to track unsupported ops.
1020 bool quantization_not_supported = false;
1021
1022 // Loop over the graph and quantize ops.
1023 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1024 subgraph_idx++) {
1025 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1026 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1027 OperatorT* op = subgraph->operators[op_idx].get();
1028 const BuiltinOperator op_code =
1029 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1030 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1031 operator_property::OperatorProperty property =
1032 GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
1033 operator_name, activations_type);
1034 if (!IsRealValueOp(real_value_op_set, operator_name)) {
1035 continue;
1036 }
1037
1038 if (activations_type == TensorType_INT16 && !property.quantizable &&
1039 !allow_float) {
1040 TF_LITE_REPORT_ERROR(
1041 error_reporter,
1042 "Quantization to 16x8-bit not yet supported for op: '%s'.\n",
1043 EnumNameBuiltinOperator(op_code));
1044 quantization_not_supported = true;
1045 } else if (!property.quantizable && !allow_float) {
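        // A Dequantize op that feeds a graph output is left as-is (it keeps
        // the model's float output), so skip the unsupported-op error for it.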
1046 if (op_code == BuiltinOperator_DEQUANTIZE &&
1047 std::find(subgraph->outputs.begin(), subgraph->outputs.end(),
1048 op->outputs[0]) != subgraph->outputs.end()) {
1049 continue;
1050 }
1051 TF_LITE_REPORT_ERROR(error_reporter,
1052 "Quantization not yet supported for op: '%s'.\n",
1053 EnumNameBuiltinOperator(op_code));
1054 quantization_not_supported = true;
1055 }
1056
1057 // Quantize operator inputs/weights.
1058 for (const std::pair<int, operator_property::TensorProperty>& input :
1059 GetInputs(op, property)) {
1060 TF_LITE_ENSURE_STATUS(QuantizeOpInput(model, subgraph_idx, &op_idx,
1061 property, input, activations_type,
1062 error_reporter));
1063 }
1064
1065 // Quantize operator outputs.
1066 for (const std::pair<int, operator_property::TensorProperty>& output :
1067 GetOutputs(op, property)) {
1068 TF_LITE_ENSURE_STATUS(
1069 QuantizeOpOutput(model, subgraph_idx, op_idx, property, output,
1070 activations_type, error_reporter));
1071 }
1072 }
1073 }
1074
1075 // Return an error if any unsupported op was encountered.
1076 if (quantization_not_supported) {
1077 return kTfLiteError;
1078 }
1079 return kTfLiteOk;
1080 }
1081
1082 // Quantize bias.
1083 TfLiteStatus QuantizeBiases(ModelT* model,
1084 const std::unordered_set<string>& operator_names,
1085 const std::unordered_set<string>& real_value_op_set,
1086 const TensorType& activations_type,
1087 ErrorReporter* error_reporter) {
1088 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1089 subgraph_idx++) {
1090 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1091 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1092 OperatorT* op = subgraph->operators[op_idx].get();
1093 const BuiltinOperator op_code =
1094 GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1095 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1096 operator_property::OperatorProperty property =
1097 GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
1098 operator_name, activations_type);
1099 if (!property.quantizable ||
1100 !IsRealValueOp(real_value_op_set, operator_name)) {
1101 continue;
1102 }
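      // Each bias is quantized using the scales of the op's input and weight
      // tensors; see QuantizeBias above.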
1103 for (const int bias_idx : property.biases) {
1104 if (bias_idx >= op->inputs.size() ||
1105 op->inputs[bias_idx] == kTfLiteOptionalTensor) {
1106 continue;
1107 }
1108 // Quantize the bias only if it has not already been quantized as the
1109 // output or input of another op.
1110 TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
1111 if (!utils::QuantizationParametersExist(bias_tensor)) {
1112 if (utils::HasBuffer(model, subgraph, op->inputs[bias_idx])) {
1113 if (property.inputs.size() != 2) {
1114 TF_LITE_REPORT_ERROR(error_reporter,
1115 "Expected the input length of "
1116 "op %s at index %d in subgraph %d to be 2, "
1117 "got %d",
1118 EnumNameBuiltinOperator(op_code), op_idx,
1119 subgraph_idx, property.inputs.size());
1120 return kTfLiteError;
1121 }
1122 TensorT* input_tensor =
1123 subgraph->tensors[op->inputs[property.inputs[0].first]].get();
1124 TensorT* weight_tensor =
1125 subgraph->tensors[op->inputs[property.inputs[1].first]].get();
1126 operator_property::TensorProperty weight_property =
1127 property.inputs[1].second;
1128 TF_LITE_ENSURE_STATUS(QuantizeBias(
1129 model, input_tensor, weight_tensor, bias_tensor,
1130 weight_property.per_axis, weight_property.per_axis_index,
1131 activations_type, error_reporter));
1132 }
1133 } else {
1134 // If bias is already quantized, make sure it is quantized to 32 bit.
1135 if (bias_tensor->type != TensorType_INT32) {
1136 TF_LITE_REPORT_ERROR(
1137 error_reporter,
1138 "Bias (\"%s\" at global index %d) of op \"%s\" at op_index %d "
1139 "in subgraph %d is expected to be quantized to INT32 but it is "
1140 "already quantized to %s.\n",
1141 bias_tensor->name.c_str(), op->inputs[bias_idx],
1142 operator_name.c_str(), op_idx, subgraph_idx,
1143 EnumNameTensorType(bias_tensor->type));
1144 }
1145 }
1146 }
1147 }
1148 }
1149 return kTfLiteOk;
1150 }
1151
1152 std::unordered_set<string> GetAllOperatorOutputs(ModelT* model) {
1153 std::unordered_set<string> operator_names;
1154 for (int32_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1155 subgraph_idx++) {
1156 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1157 for (int32_t tensor_idx = 0; tensor_idx < subgraph->tensors.size();
1158 tensor_idx++) {
1159 operator_names.insert(subgraph->tensors[tensor_idx]->name);
1160 }
1161 }
1162 return operator_names;
1163 }
1164 // Populate the quantization parameters max and min for input tensors.
1165 // Assumes that dynamic tensors already have stored min, max values and throws
1166 // an error if a tensor does not have min, max quantization parameters or a
1167 // buffer.
1168 // If any static tensors are not inputs to an operation, their max, min values
1169 // will not be filled by this function.
1170 TfLiteStatus FillQuantizationParams(
1171 ModelT* model, const std::unordered_set<string>& operator_names,
1172 const std::unordered_set<string>& real_value_op_set,
1173 const TensorType& activations_type, ErrorReporter* error_reporter) {
1174 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1175 subgraph_idx++) {
1176 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1177 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1178 OperatorT* op = subgraph->operators[op_idx].get();
1179 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1180 operator_property::OperatorProperty property =
1181 GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
1182 operator_name, activations_type);
1183 if (!IsRealValueOp(real_value_op_set, operator_name)) {
1184 continue;
1185 }
1186
1187 // Populate max, min for each input tensor.
1188 for (const std::pair<int, operator_property::TensorProperty>& input :
1189 property.inputs) {
1190 // Get tensor.
1191 const int32_t input_idx = input.first;
1192 const int32_t tensor_idx = op->inputs[input_idx];
1193 if (tensor_idx == -1) {
1194 // Skip optional tensor.
1195 continue;
1196 }
1197 TensorT* tensor = subgraph->tensors[tensor_idx].get();
1198
1199 // Static tensor.
1200 if (!utils::HasMinMax(tensor) &&
1201 utils::HasBuffer(model, subgraph, tensor_idx)) {
1202 // Get input float data and tensor dimensions.
1203 const BufferT* buffer = model->buffers[tensor->buffer].get();
1204 const float* float_input_data =
1205 reinterpret_cast<const float*>(buffer->data.data());
1206
1207 if (tensor->quantization == nullptr) {
1208 tensor->quantization = absl::make_unique<QuantizationParametersT>();
1209 }
1210
1211 // Fill per channel max and min with respect to channel_dim_index.
1212 if (input.second.per_axis) {
1213 if (tensor->shape.size() == 4) {
1214 int32_t channel_dim_index = input.second.per_axis_index;
1215 TF_LITE_ENSURE_STATUS(utils::FillPerChannelMinMax(
1216 float_input_data, tensor->shape, channel_dim_index,
1217 tensor->quantization.get(), error_reporter));
1218 } else {
1219 TF_LITE_REPORT_ERROR(
1220 error_reporter,
1221 "Could not fill max min for tensor as the dimension is %d "
1222 "and not 4 as expected.",
1223 tensor->shape.size());
1224 return kTfLiteError;
1225 }
1226
1227 // Fill per layer max and min.
1228 } else if (!utils::HasMinMax(tensor) && !input.second.per_axis &&
1229 utils::HasBuffer(model, subgraph, tensor_idx)) {
1230 uint64_t input_size;
1231 TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &input_size));
1232 utils::FillSingleMinMax(float_input_data, input_size,
1233 tensor->quantization.get());
1234 }
1235 if (tensor->quantization->quantized_dimension !=
1236 input.second.per_axis_index) {
1237 TF_LITE_REPORT_ERROR(
1238 error_reporter,
1239 "Quantized dimension for tensor property and quantization "
1240 "parameters do not match. Got %d and %d respectively.",
1241 input.second.per_axis_index,
1242 tensor->quantization->quantized_dimension);
1243 return kTfLiteError;
1244 }
1245
1246 // Dynamic tensor.
1247 } else if (!utils::HasMinMax(tensor) &&
1248 !utils::HasBuffer(model, subgraph, tensor_idx)) {
1249 TF_LITE_REPORT_ERROR(
1250 error_reporter,
1251 "Max and min for dynamic tensors should be"
1252 " recorded during calibration: Failed for tensor %s\n",
1253 tensor->name.c_str());
1254 if (tensor->quantization == nullptr) {
1255 TF_LITE_REPORT_ERROR(error_reporter,
1256 "No quantization params for tensor %s",
1257 tensor->name.c_str());
1258 } else if (tensor->quantization->min.empty() ||
1259 tensor->quantization->max.empty()) {
1260 TF_LITE_REPORT_ERROR(error_reporter, "Empty min/max for tensor %s",
1261 tensor->name.c_str());
1262 }
1263 return kTfLiteError;
1264 }
1265
1266 if (utils::QuantizationParametersExist(tensor)) {
1267 TF_LITE_REPORT_ERROR(
1268 error_reporter,
1269 "Scale and zero points should not be recorded before "
1270 "quantization.");
1271 return kTfLiteError;
1272 }
1273 } // loop over op inputs
1274 } // loop over ops
1275 } // loop over subgraphs
1276 return kTfLiteOk;
1277 }
1278
1279 // Check compatibility of activation, weight and bias scales. Adjust if needed.
1280 TfLiteStatus EnsureBiasScaleCompatibility(
1281 ModelT* model, const std::unordered_set<string>& operator_names,
1282 const std::unordered_set<string>& real_value_op_set,
1283 const TensorType& activations_type, ErrorReporter* error_reporter) {
1284 for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1285 subgraph_idx++) {
1286 SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1287 for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1288 OperatorT* op = subgraph->operators[op_idx].get();
1289 const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1290 operator_property::OperatorProperty property =
1291 GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
1292 operator_name, activations_type);
1293 if (!IsRealValueOp(real_value_op_set, operator_name)) {
1294 continue;
1295 }
1296
1297 // Loop over all bias tensors.
1298 for (const int bias_idx : property.biases) {
1299 if (bias_idx >= op->inputs.size() ||
1300 op->inputs[bias_idx] == kTfLiteOptionalTensor) {
1301 continue;
1302 }
1303 TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
1304 if (bias_tensor->shape.size() != 1) {
1305 TF_LITE_REPORT_ERROR(error_reporter,
1306 "Expected bias tensor to be a vector.");
1307 return kTfLiteError;
1308 }
1309 int32_t channel_dim_size = bias_tensor->shape[0];
1310
1311 if (property.inputs.size() != 2) { // Only works for two input tensors.
1312 TF_LITE_REPORT_ERROR(
1313 error_reporter,
1314 "Expect %d inputs for op %s at index %d in subgraph %d to be 2",
1315 property.inputs.size(), op_idx, subgraph_idx);
1316 return kTfLiteError;
1317 }
1318
1319 if (!property.arbitrary_inputs && property.quantizable) {
1320 // Get input and weight tensors.
1321 TensorT* input_tensor =
1322 subgraph->tensors[op->inputs[property.inputs[0].first]].get();
1323 TensorT* weight_tensor =
1324 subgraph->tensors[op->inputs[property.inputs[1].first]].get();
1325 operator_property::TensorProperty weight_property =
1326 property.inputs[1].second;
1327 TF_LITE_ENSURE(error_reporter, input_tensor->quantization);
1328
1329 // Check quantization parameters exist for input.
1330 if (!utils::HasMinMax(input_tensor)) {
1331 TF_LITE_REPORT_ERROR(
1332 error_reporter,
1333 "Input tensor missing quantization information. Should be "
1334 "populated during calibration.");
1335 return kTfLiteError;
1336 }
1337
1338 // Get input scale for asymmetric quantization.
1339 QuantizationParametersT temp_quant_params = QuantizationParametersT();
1340 TF_LITE_ENSURE_STATUS(
1341 utils::GetQuantizationParams(input_tensor, activations_type,
1342 &temp_quant_params, error_reporter));
1343 if (temp_quant_params.scale.size() != 1) {
1344 TF_LITE_REPORT_ERROR(error_reporter,
1345 "Unexpected input quantization scale size.");
1346 return kTfLiteError;
1347 }
1348 float input_scale = temp_quant_params.scale[0];
1349
1350 // Check that max/min values have been filled for weights.
1351 if (!utils::HasMinMax(weight_tensor)) {
1352 TF_LITE_REPORT_ERROR(
1353 error_reporter,
1354 "Min and/or max values have not been recorded for weight "
1355 "tensor. This should have happened in FillQuantizationParams.");
1356 return kTfLiteError;
1357 }
1358
1359 // Ensure the tensor dimensions are compatible.
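// (Added note: with per-axis (per-channel) quantized weights there is one
// scale per output channel, so the bias -- one element per output channel --
// must be as long as the weight dimension at per_axis_index. For example, a
// Conv2D weight of shape [64, 3, 3, 16] quantized along axis 0 pairs with a
// bias of shape [64].)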
1360 if (weight_property.per_axis) {
1361 if (bias_tensor->shape[0] !=
1362 weight_tensor->shape[weight_property.per_axis_index]) {
1363 TF_LITE_REPORT_ERROR(
1364 error_reporter,
1365 "Channel mismatch between bias and weight tensors %d vs %d",
1366 bias_tensor->shape[0],
1367 weight_tensor->shape[weight_property.per_axis_index]);
1368 return kTfLiteError;
1369 }
1370 // Ensure that the number of max/mins matches the channel_dim_size.
1371 if (weight_tensor->quantization->max.size() != channel_dim_size) {
1372 TF_LITE_REPORT_ERROR(
1373 error_reporter,
1374 "Mismatch between number of weight maxes and channels: %zu vs "
1375 "%d",
1376 weight_tensor->quantization->max.size(), channel_dim_size);
1377 return kTfLiteError;
1378 }
1379 if (weight_tensor->quantization->min.size() != channel_dim_size) {
1380 TF_LITE_REPORT_ERROR(
1381 error_reporter,
1382 "Mismatch between number of weight mins and channels: %zu vs %d",
1383 weight_tensor->quantization->min.size(), channel_dim_size);
1384 return kTfLiteError;
1385 }
1386 }
1387
1388 // Get data and size of bias tensor.
1389 const BufferT* buffer = model->buffers[bias_tensor->buffer].get();
1390 const float* bias_data =
1391 reinterpret_cast<const float*>(buffer->data.data());
1392 uint64_t bias_size;
1393 TF_LITE_ENSURE_STATUS(utils::NumElements(*bias_tensor, &bias_size));
1394
1395 // Adjust weight scales if needed.
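// (Added note, hedged: the intent of the helper below, as used here, is that
// bias values quantized at scale = input_scale * weight_scale must fit in the
// quantized bias type; when a bias element would saturate, the weight min/max
// are widened so the derived bias scale becomes large enough.)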
1396 TF_LITE_ENSURE_STATUS(utils::AdjustWeightsForBiasScale(
1397 weight_tensor->quantization.get(), bias_data, bias_size,
1398 input_scale, error_reporter));
1399
1400 if (utils::QuantizationParametersExist(weight_tensor)) {
1401 TF_LITE_REPORT_ERROR(
1402 error_reporter,
1403 "Scale and zero points should not be recorded for the weight "
1404 "tensor before quantization.");
1405 return kTfLiteError;
1406 }
1407 if (utils::QuantizationParametersExist(input_tensor)) {
1408 TF_LITE_REPORT_ERROR(
1409 error_reporter,
1410 "Scale and zero points should not be recorded for the input "
1411 "tensor before quantization.");
1412 return kTfLiteError;
1413 }
1414 }
1415 }
1416 }
1417 }
1418 return kTfLiteOk;
1419 }
1420
1421 } // namespace
1422
1423 // Assumes that the operators in the model have been topologically sorted.
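// Roughly, the passes below run in the order they are called (summary added
// for readability; the function body is authoritative): fill per-tensor
// quantization parameters from recorded min/max, make bias scales compatible
// with input and weight scales, quantize intermediate tensors, tie tensors
// that must share a range, quantize weights/inputs/outputs, apply per-op
// constraints, quantize biases, bump operator versions, fix up the model's
// input/output types, adjust ADD/SUB properties for the activation type, and
// finally serialize the result into `builder`.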
1424 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1425 ModelT* model, const TensorType& input_type,
1426 const TensorType& output_type, bool allow_float,
1427 const std::unordered_set<string>& operator_names,
1428 const TensorType& activations_type,
1429 ErrorReporter* error_reporter) {
1430 auto real_value_op_set =
1431 PopulateRealValueOpSet(model, operator_names, activations_type);
1432 TF_LITE_ENSURE_STATUS(
1433 FillQuantizationParams(model, operator_names, real_value_op_set,
1434 activations_type, error_reporter));
1435 TF_LITE_ENSURE_STATUS(
1436 EnsureBiasScaleCompatibility(model, operator_names, real_value_op_set,
1437 activations_type, error_reporter));
1438 TF_LITE_ENSURE_STATUS(
1439 QuantizeIntemediateTensors(model, activations_type, error_reporter));
1440 TF_LITE_ENSURE_STATUS(QuantizeSharedRange(model, error_reporter));
1441 TF_LITE_ENSURE_STATUS(QuantizeWeightsInputOutput(
1442 model, allow_float, operator_names, real_value_op_set, activations_type,
1443 error_reporter));
1444 TF_LITE_ENSURE_STATUS(ApplyConstraints(model, operator_names,
1445 real_value_op_set, activations_type,
1446 error_reporter));
1447 TF_LITE_ENSURE_STATUS(QuantizeBiases(model, operator_names, real_value_op_set,
1448 activations_type, error_reporter));
1449 utils::SetOperatorCodeVersion(model);
1450 TF_LITE_ENSURE_STATUS(SetInputAndOutputTypes(
1451 model, input_type, output_type, activations_type, error_reporter));
1452 SetOperatorPropertyADDSUBOperator(model, activations_type);
1453 flatbuffers::Offset<Model> output_model_location =
1454 Model::Pack(*builder, model);
1455 FinishModelBuffer(*builder, output_model_location);
1456
1457 return kTfLiteOk;
1458 }
1459
1460 TfLiteStatus QuantizeModelAllOperators(flatbuffers::FlatBufferBuilder* builder,
1461 ModelT* model,
1462 const TensorType& input_type,
1463 const TensorType& output_type,
1464 bool allow_float,
1465 const TensorType& activations_type,
1466 ErrorReporter* error_reporter) {
1467 return QuantizeModel(builder, model, input_type, output_type, allow_float,
1468 GetAllOperatorOutputs(model), activations_type,
1469 error_reporter);
1470 }
1471
1472 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1473 ModelT* model, const TensorType& input_type,
1474 const TensorType& output_type, bool allow_float,
1475 ErrorReporter* error_reporter) {
1476 return QuantizeModel(builder, model, input_type, output_type, allow_float,
1477 GetAllOperatorOutputs(model), TensorType_INT8,
1478 error_reporter);
1479 }
1480
1481 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1482 ModelT* model, const TensorType& input_type,
1483 const TensorType& output_type,
1484 ErrorReporter* error_reporter) {
1485 return QuantizeModel(builder, model, input_type, output_type,
1486 /*allow_float=*/false, error_reporter);
1487 }
1488
1489 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1490 ModelT* model, ErrorReporter* error_reporter) {
1491 return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32,
1492 /*allow_float=*/false, error_reporter);
1493 }
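// Example usage (illustrative sketch only; assumes the caller has already
// unpacked a calibrated float model into a ModelT, e.g. via
// FlatBufferModel::BuildFromFile() followed by GetModel()->UnPack()):
//
//   flatbuffers::FlatBufferBuilder builder;
//   tflite::StderrReporter error_reporter;
//   if (QuantizeModel(&builder, unpacked_model.get(), TensorType_INT8,
//                     TensorType_INT8, &error_reporter) != kTfLiteOk) {
//     // The error reporter has already logged the cause of the failure.
//   }
//   // On success, builder.GetBufferPointer() / builder.GetSize() hold the
//   // serialized quantized model.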
1494
1495 } // namespace optimize
1496 } // namespace tflite
1497