1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/lite/tools/optimize/quantize_model.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <limits>
20 #include <memory>
21 #include <string>
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26 
27 #include "flatbuffers/flexbuffers.h"
28 #include "tensorflow/lite/context.h"
29 #include "tensorflow/lite/core/api/error_reporter.h"
30 #include "tensorflow/lite/model.h"
31 #include "tensorflow/lite/schema/schema_generated.h"
32 #include "tensorflow/lite/schema/schema_utils.h"
33 #include "tensorflow/lite/tools/optimize/model_utils.h"
34 #include "tensorflow/lite/tools/optimize/operator_property.h"
35 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
36 
37 namespace tflite {
38 namespace optimize {
39 
40 namespace {
41 
42 bool IsFloatTensor(const SubGraphT* subgraph, int32_t tensor_idx) {
43   TensorT* tensor = subgraph->tensors[tensor_idx].get();
44   if (tensor->type != TensorType_FLOAT32) {
45     // Skip non-real-valued tensor.
46     return false;
47   }
48   return true;
49 }
50 
51 // Gets the operator property from the operator_property list and additionally
52 // modifies the quantizable parameter based on the user-specified
53 // operator_names.
54 operator_property::OperatorProperty GetOperatorProperty(
55     const std::unordered_set<string>& operator_names, const ModelT* model,
56     int subgraph_index, int op_idx, const string& operator_name,
57     const TensorType& activations_type) {
58   operator_property::OperatorProperty property =
59       operator_property::GetOperatorProperty(model, subgraph_index, op_idx);
60   const SubGraphT* subgraph = model->subgraphs[subgraph_index].get();
61   const OperatorT* op = subgraph->operators[op_idx].get();
62   const BuiltinOperator op_code =
63       GetBuiltinCode(model->operator_codes[op->opcode_index].get());
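  // 16x8 (int16 activations) quantization supports only a subset of ops, so an
  // op that is not quantizable_int16 is treated as not quantizable at all.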
64   if (activations_type == TensorType_INT16 && !property.quantizable_int16) {
65     property.quantizable = false;
66   }
67   // The algorithm adds Dequantize and Quantize, so we don't require them to be
68   // in the operator_names.
69   if (op_code != BuiltinOperator_DEQUANTIZE &&
70       op_code != BuiltinOperator_QUANTIZE) {
71     property.quantizable =
72         property.quantizable &&
73         (operator_names.find(operator_name) != operator_names.end());
74   }
75   return property;
76 }
77 
78 bool IsRealValueOp(const std::unordered_set<string>& real_value_op_set,
79                    const string& operator_name) {
80   return real_value_op_set.find(operator_name) != real_value_op_set.end();
81 }
82 
83 // Creates a set containing all ops that consume or produce real-valued (float)
84 // tensors in the source graph, plus any non-quantizable ops.
85 std::unordered_set<string> PopulateRealValueOpSet(
86     ModelT* model, const std::unordered_set<string>& operator_names,
87     const TensorType& activations_type) {
88   std::unordered_set<string> real_value_op_set;
89   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
90        subgraph_idx++) {
91     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
92     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
93       OperatorT* op = subgraph->operators[op_idx].get();
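      // Throughout this file an op is identified by the name of its first
      // output tensor.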
94       const string operator_name = subgraph->tensors[op->outputs[0]]->name;
95       operator_property::OperatorProperty property =
96           GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
97                               operator_name, activations_type);
98 
99       if (!property.quantizable) {
100         real_value_op_set.insert(operator_name);
101         continue;
102       }
103 
104       for (const std::pair<int, operator_property::TensorProperty>& input :
105            property.inputs) {
106         const int32_t input_idx = input.first;
107         const int32_t tensor_idx = op->inputs[input_idx];
108         if (IsFloatTensor(subgraph, tensor_idx)) {
109           real_value_op_set.insert(operator_name);
110           break;
111         }
112       }
113       for (const std::pair<int, operator_property::TensorProperty>& output :
114            property.outputs) {
115         const int32_t output_idx = output.first;
116         const int32_t tensor_idx = op->outputs[output_idx];
117         if (IsFloatTensor(subgraph, tensor_idx)) {
118           real_value_op_set.insert(operator_name);
119           break;
120         }
121       }
122 
123       if (property.arbitrary_inputs) {
124         const int32_t tensor_idx = op->inputs[0];
125         if (IsFloatTensor(subgraph, tensor_idx)) {
126           real_value_op_set.insert(operator_name);
127         }
128       }
129 
130       if (property.arbitrary_outputs) {
131         const int32_t tensor_idx = op->outputs[0];
132         if (IsFloatTensor(subgraph, tensor_idx)) {
133           real_value_op_set.insert(operator_name);
134         }
135       }
136     }
137   }
138   return real_value_op_set;
139 }
140 
141 TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor,
142                           const TensorT* weight_tensor, TensorT* bias_tensor,
143                           bool is_per_channel, int channel_dim_index,
144                           const TensorType& activations_type,
145                           ErrorReporter* error_reporter) {
146   if (bias_tensor->shape.size() != 1) {
147     TF_LITE_REPORT_ERROR(error_reporter, "Expected bias tensor to have rank 1.");
148     return kTfLiteError;
149   }
150 
151   int32_t channel_dim_size = bias_tensor->shape[0];
152   TF_LITE_ENSURE(error_reporter, weight_tensor->quantization);
153   std::vector<float> weight_scales = weight_tensor->quantization->scale;
154 
155   if (is_per_channel) {
156     if (bias_tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
157       TF_LITE_REPORT_ERROR(
158           error_reporter,
159           "Channel mismatch between bias and weight tensors %d vs %d",
160           bias_tensor->shape[0], weight_tensor->shape[channel_dim_index]);
161       return kTfLiteError;
162     }
163     if (!input_tensor->quantization ||
164         input_tensor->quantization->scale.size() != 1) {
165       TF_LITE_REPORT_ERROR(error_reporter,
166                            "Input tensor missing quantization information");
167       return kTfLiteError;
168     }
169 
170     if (weight_scales.size() != channel_dim_size) {
171       TF_LITE_REPORT_ERROR(error_reporter,
172                            "Mismatch weight scale dimension: %d",
173                            weight_scales.size());
174       return kTfLiteError;
175     }
176     if (activations_type == tflite::TensorType_INT16) {
177       return utils::SymmetricPerChannelBiasQuantize<std::int64_t>(
178           model, bias_tensor, input_tensor->quantization->scale[0],
179           weight_scales.data(), channel_dim_size, error_reporter);
180     } else {
181       return utils::SymmetricPerChannelBiasQuantize<std::int32_t>(
182           model, bias_tensor, input_tensor->quantization->scale[0],
183           weight_scales.data(), channel_dim_size, error_reporter);
184     }
185   } else {
186     if (weight_scales.size() != 1) {
187       TF_LITE_REPORT_ERROR(
188           error_reporter,
189           "Expected per-layer weight scale dimension size 1, got %d",
190           weight_scales.size());
191       return kTfLiteError;
192     }
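    // Per-layer bias scale is the product input_scale * weight_scale; biases
    // are stored as int64 for 16-bit activations and int32 otherwise.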
193     if (activations_type == tflite::TensorType_INT16) {
194       return utils::SymmetricPerLayerBiasQuantize<std::int64_t>(
195           model, bias_tensor,
196           input_tensor->quantization->scale[0] * weight_scales[0],
197           error_reporter);
198     } else {
199       return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
200           model, bias_tensor,
201           input_tensor->quantization->scale[0] * weight_scales[0],
202           error_reporter);
203     }
204   }
205   return kTfLiteError;
206 }
207 
208 // True if the tensor type has to be modified.
209 bool TensorTypeChangeRequired(const TensorT* tensor, const TensorType& type) {
210   // The quantized model is type INT8/INT16, so if the user provided type is
211   // INT8/INT16, we do not have to do any custom logic. Additionally, if the
212   // current tensor isn't INT8/INT16 quantized, the custom type doesn't apply.
213   bool int8check = type != TensorType_INT8 && tensor->type == TensorType_INT8 &&
214                    !tensor->quantization->scale.empty();
215   bool int16check = type != TensorType_INT16 &&
216                     tensor->type == TensorType_INT16 &&
217                     !tensor->quantization->scale.empty();
218   return (int8check || int16check);
219 }
220 
221 // Returns false when the input is consumed only by a Quantize op whose output
222 // already has the same type, scale and zero point; no extra quantize is needed.
223 bool InputQuantizeRequired(const ModelT* model, const SubGraphT* subgraph,
224                            int32_t input_idx) {
225   std::vector<OperatorT*> quantize_ops;
226   for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
227     OperatorT* op = subgraph->operators[op_idx].get();
228     if (std::find(op->inputs.begin(), op->inputs.end(), input_idx) !=
229         op->inputs.end()) {
230       const BuiltinOperator op_code =
231           GetBuiltinCode(model->operator_codes[op->opcode_index].get());
232       if (op_code != BuiltinOperator_QUANTIZE) {
233         return true;
234       }
235       quantize_ops.push_back(op);
236     }
237   }
238   if (quantize_ops.size() == 1) {
239     const auto* tensor = subgraph->tensors[input_idx].get();
240     const auto* op = quantize_ops[0];
241     const int32_t output_idx = op->outputs[0];
242     const auto output_type = subgraph->tensors[output_idx]->type;
243     const float output_scale =
244         subgraph->tensors[output_idx]->quantization->scale[0];
245     const int64_t output_zero_point =
246         subgraph->tensors[output_idx]->quantization->zero_point[0];
247     if (output_type == tensor->type &&
248         output_scale == tensor->quantization->scale[0] &&
249         output_zero_point == tensor->quantization->zero_point[0]) {
250       return false;
251     }
252   }
253   return true;
254 }
255 
256 // Sets the input type, adding a Leading Op node at the start of the model if
257 // necessary.
258 // Returns the new input tensor index.
259 int32_t SetInputType(ModelT* model, SubGraphT* subgraph,
260                      const int32_t tensor_idx, const TensorType& input_type,
261                      const TensorType& activations_type) {
262   TensorT* tensor = subgraph->tensors[tensor_idx].get();
263   if (!TensorTypeChangeRequired(tensor, input_type)) {
264     return -1;
265   }
266   if (input_type == TensorType_FLOAT32 || input_type == TensorType_UINT8) {
267     std::string type_string =
268         activations_type == TensorType_INT16 ? "int16" : "int8";
269     // Create a new tensor to be the input of the leading Op.
270     std::unique_ptr<TensorT> leading_op_input;
271     if (input_type == TensorType_FLOAT32) {
272       // Add tensor for quantize operator. Scales and zero points are not
273       // needed.
274       const string leading_op_name = tensor->name;
275       const string new_name_original_input = tensor->name + "_" + type_string;
276       tensor->name = new_name_original_input;
277       utils::MakeTensor(leading_op_name, tensor->shape, tensor->shape_signature,
278                         input_type, &leading_op_input);
279     } else {
280       // Get scale and zero point from the first tensor.
281       const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
282       const int64_t zero_point =
283           subgraph->tensors[tensor_idx]->quantization->zero_point[0];
284 
285       //  Add tensor for requantize operator. Scale is the existing scale and
286       //  zero point is shifted by +128.
287       TFLITE_DCHECK_GE(zero_point, -128);
288       TFLITE_DCHECK_LE(zero_point, 127);
289       const string leading_op_name = tensor->name;
290       const string new_name_original_input = tensor->name + "_" + type_string;
291       tensor->name = new_name_original_input;
292       utils::MakeTensorWithQuantParam(
293           leading_op_name, tensor->shape, tensor->shape_signature, input_type,
294           scale, zero_point + 128, &leading_op_input);
295     }
296 
297     // Check if quantize op already exists.
298     if (!InputQuantizeRequired(model, subgraph, tensor_idx)) {
299       subgraph->tensors[tensor_idx] = std::move(leading_op_input);
300       return tensor_idx;
301     }
302 
303     const int32_t leading_op_input_idx = subgraph->tensors.size();
304     subgraph->tensors.push_back(std::move(leading_op_input));
305 
306     // Create the leading op, a Quantize op that quantizes or requantizes
307     // the input.
308     std::unique_ptr<OperatorT> leading_op;
309     utils::MakeQuantizeOperator(model, &leading_op, leading_op_input_idx,
310                                 tensor_idx);
311 
312     // Insert the new op at the start of the model.
313     subgraph->operators.insert(subgraph->operators.begin(),
314                                std::move(leading_op));
315     return leading_op_input_idx;
316   }
317   return -1;
318 }
319 
320 // Sets the output type, adding a Tailing Op node at the end of the model if
321 // necessary.
322 // Returns the new output tensor index.
323 int32_t SetOutputType(ModelT* model, SubGraphT* subgraph,
324                       const int32_t tensor_idx, const TensorType& output_type,
325                       const TensorType& activations_type) {
326   TensorT* tensor = subgraph->tensors[tensor_idx].get();
327   if (!TensorTypeChangeRequired(tensor, output_type)) {
328     return -1;
329   }
330   if (output_type == TensorType_FLOAT32 || output_type == TensorType_UINT8) {
331     std::string type_string =
332         activations_type == TensorType_INT16 ? "int16" : "int8";
333     // Create a new tensor to be the output of the tailing op.
334     std::unique_ptr<TensorT> tailing_op_output;
335     if (output_type == TensorType_FLOAT32) {
336       const string tailing_op_name = tensor->name;
337       const string new_name_original_output = tensor->name + "_" + type_string;
338       tensor->name = new_name_original_output;
339       utils::MakeTensor(tailing_op_name, tensor->shape, tensor->shape_signature,
340                         output_type, &tailing_op_output);
341     } else {
342       // Get scale and zero point from the last tensor.
343       const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
344       const int64_t zero_point =
345           subgraph->tensors[tensor_idx]->quantization->zero_point[0];
346 
347       //  Add tensor for requantize operator. Scale is the existing scale and
348       //  zero point is shifted by +128.
349       TFLITE_DCHECK_GE(zero_point, -128);
350       TFLITE_DCHECK_LE(zero_point, 127);
351       const string tailing_op_name = tensor->name;
352       const string new_name_original_output = tensor->name + "_" + type_string;
353       tensor->name = new_name_original_output;
354       utils::MakeTensorWithQuantParam(
355           tailing_op_name, tensor->shape, tensor->shape_signature, output_type,
356           scale, zero_point + 128, &tailing_op_output);
357     }
358     const int32_t tailing_op_output_idx = subgraph->tensors.size();
359     subgraph->tensors.push_back(std::move(tailing_op_output));
360 
361     // Create the tailing operation.
362     std::unique_ptr<OperatorT> tailing_op;
363     if (output_type == TensorType_FLOAT32) {
364       // Tailing Op is Dequantize Op.
365       utils::MakeDequantizeOperator(model, &tailing_op, tensor_idx,
366                                     tailing_op_output_idx);
367     } else {
368       // Tailing Op is Quantize Op that does requantization.
369       utils::MakeQuantizeOperator(model, &tailing_op, tensor_idx,
370                                   tailing_op_output_idx);
371     }
372     // Add the operator at the end of the model.
373     subgraph->operators.push_back(std::move(tailing_op));
374     return tailing_op_output_idx;
375   }
376   return -1;
377 }
378 
379 // Sets the input and output types to the provided types. Leading and
380 // tailing operations will be added if needed.
381 // For Float input and output, leading op is Quantize and tailing op is
382 // Dequantize.
383 // For Uint8 input and output, the leading op is Quantize (uint8 to
384 // int8, which can be thought of as "requant") and the tailing op is also
385 // Quantize (int8 to uint8, again a "requant").
386 TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
387                                     const TensorType& output_type,
388                                     const TensorType& activations_type,
389                                     ErrorReporter* error_reporter) {
390   for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
391        subgraph_idx++) {
392     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
393 
394     for (int i = 0; i < subgraph->inputs.size(); ++i) {
395       TensorT* tensor = subgraph->tensors[subgraph->inputs[i]].get();
396       // TODO(suharshs): Add support for this case if it ever comes up.
397       if (tensor->type == TensorType_FLOAT32 && input_type != tensor->type) {
398         TF_LITE_REPORT_ERROR(
399             error_reporter,
400             "Unsupported input type %s for input tensor %d of type %s.",
401             EnumNameTensorType(input_type), subgraph->inputs[i],
402             EnumNameTensorType(tensor->type));
403         return kTfLiteError;
404       }
405       const int32_t input_idx = SetInputType(
406           model, subgraph, subgraph->inputs[i], input_type, activations_type);
407       if (input_idx < 0) {
408         continue;
409       }
410       subgraph->inputs[i] = input_idx;
411     }
412     for (int i = 0; i < subgraph->outputs.size(); ++i) {
413       TensorT* tensor = subgraph->tensors[subgraph->outputs[i]].get();
414       // TODO(suharshs): Add support for this case if it ever comes up.
415       if (tensor->type == TensorType_FLOAT32 && output_type != tensor->type) {
416         TF_LITE_REPORT_ERROR(
417             error_reporter,
418             "Unsupported output type %s for output tensor '%s' of type %s.",
419             EnumNameTensorType(output_type), tensor->name.c_str(),
420             EnumNameTensorType(tensor->type));
421         return kTfLiteError;
422       }
423       const int32_t output_idx = SetOutputType(
424           model, subgraph, subgraph->outputs[i], output_type, activations_type);
425       if (output_idx < 0) {
426         continue;
427       }
428       subgraph->outputs[i] = output_idx;
429     }
430   }
431   return kTfLiteOk;
432 }
433 
434 // Apply constraints to ops if they have any.
435 // We have made the restriction that for int8 quantized concat, minimum, and
436 // maximum, the inputs and outputs must have the same scale and zero point.
437 // The remaining ops with constraints are handled in QuantizeWeightsInputOutput.
438 TfLiteStatus ApplyConstraints(
439     ModelT* model, const std::unordered_set<string>& operator_names,
440     const std::unordered_set<string>& real_value_op_set,
441     TensorType activations_type, ErrorReporter* error_reporter) {
442   for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
443        subgraph_idx++) {
444     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
445     // Iterate backward to avoid messing with index.
446     for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) {
447       OperatorT* op = subgraph->operators[op_idx].get();
448       const string operator_name = subgraph->tensors[op->outputs[0]]->name;
449       operator_property::OperatorProperty property =
450           GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
451                               operator_name, activations_type);
452       if (!property.quantizable ||
453           !IsRealValueOp(real_value_op_set, operator_name)) {
454         continue;
455       }
456       if (!property.arbitrary_inputs ||
457           !property.restrict_same_input_output_scale) {
458         continue;
459       }
460       // If requantization is needed here, reuse the scale and zero point of
461       // the output tensor for the requantized inputs.
462       TensorT* output_tensor = subgraph->tensors[op->outputs[0]].get();
463       if (!utils::QuantizationParametersExist(output_tensor)) {
464         TF_LITE_REPORT_ERROR(
465             error_reporter,
466             "Unable to get scale or zero point from the tensor at %d.",
467             op->outputs[0]);
468         return kTfLiteError;
469       }
470       const float output_scale = output_tensor->quantization->scale[0];
471       const float output_zp = output_tensor->quantization->zero_point[0];
472       for (size_t input_idx = 0; input_idx < op->inputs.size(); ++input_idx) {
473         TensorT* input_tensor = subgraph->tensors[op->inputs[input_idx]].get();
474         if (!utils::QuantizationParametersExist(input_tensor)) {
475           TF_LITE_REPORT_ERROR(
476               error_reporter,
477               "Unable to get scale or zero point from tensor at %d.",
478               op->inputs[input_idx]);
479           return kTfLiteError;
480         }
481         if (input_tensor->quantization->scale[0] == output_scale &&
482             input_tensor->quantization->zero_point[0] == output_zp) {
483           // This input does not need to be requantized.
484           continue;
485         }
486 
487         std::unique_ptr<TensorT> additional_tensor;
488         const string requant_tensor_name = input_tensor->name + "_requantized";
489         utils::MakeTensorWithQuantParam(
490             requant_tensor_name, input_tensor->shape,
491             input_tensor->shape_signature, activations_type, output_scale,
492             output_zp, &additional_tensor);
493         const int32_t additional_tensor_idx = subgraph->tensors.size();
494         subgraph->tensors.push_back(std::move(additional_tensor));
495 
496         // Add requant op before this input.
497         // There are better ways to handle this: try to push the rescale
498         // upwards recursively and hope all upstream ops can absorb it,
499         // adding a requant only when there is no other way.
500         std::unique_ptr<OperatorT> requant_op;
501         utils::MakeQuantizeOperator(model, &requant_op, op->inputs[input_idx],
502                                     additional_tensor_idx);
503         op->inputs[input_idx] = additional_tensor_idx;
504 
505         subgraph->operators.insert(subgraph->operators.begin() + op_idx,
506                                    std::move(requant_op));
507       }
508     }
509   }
510   return kTfLiteOk;
511 }
512 
513 // In the case of int16 activations, there are two kernel implementations for
514 // the ADD/SUB operators. We set the builtin option pot_scale_int16
515 // during quantization so that from then on only the general-case
516 // implementation is used.
517 void SetOperatorPropertyADDSUBOperator(ModelT* model,
518                                        const TensorType& activations_type) {
519   if (activations_type != TensorType_INT16) {
520     // This is needed only in case of int16 activations.
521     return;
522   }
523 
524   for (int subgraph_idx = 0, end = model->subgraphs.size(); subgraph_idx < end;
525        subgraph_idx++) {
526     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
527     // Iterate backward to avoid messing with index.
528     for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) {
529       OperatorT* op = subgraph->operators[op_idx].get();
530       OperatorCodeT* op_code = model->operator_codes[op->opcode_index].get();
531       if (op_code && op_code->builtin_code == BuiltinOperator_ADD) {
532         {
533           auto* options = op->builtin_options.AsAddOptions();
534           if (options) {
535             options->pot_scale_int16 = false;
536           }
537         }
538       }
539       if (op_code && op_code->builtin_code == BuiltinOperator_SUB) {
540         {
541           auto* options = op->builtin_options.AsSubOptions();
542           if (options) {
543             options->pot_scale_int16 = false;
544           }
545         }
546       }
547     }
548   }
549 }
550 
551 std::vector<std::pair<int, operator_property::TensorProperty>> GetInputs(
552     const OperatorT* op, operator_property::OperatorProperty property) {
553   std::vector<std::pair<int, operator_property::TensorProperty>> inputs;
554   if (property.arbitrary_inputs || !property.quantizable) {
555     for (int i = 0; i < op->inputs.size(); ++i) {
556       inputs.push_back({i, {}});
557     }
558   } else {
559     inputs = property.inputs;
560   }
561   return inputs;
562 }
563 
564 std::vector<std::pair<int, operator_property::TensorProperty>> GetOutputs(
565     const OperatorT* op, operator_property::OperatorProperty property) {
566   std::vector<std::pair<int, operator_property::TensorProperty>> outputs;
567   if (property.arbitrary_outputs) {
568     for (int i = 0; i < op->outputs.size(); ++i) {
569       outputs.push_back({i, {}});
570     }
571   } else {
572     outputs = property.outputs;
573   }
574   return outputs;
575 }
576 
577 bool ShouldRestrictSameInputOutputScale(
578     operator_property::OperatorProperty property) {
579   // Ops with multiple inputs (e.g. concat, maximum and minimum) are restricted
580   // in ApplyConstraints.
581   return (!property.arbitrary_inputs &&
582           property.restrict_same_input_output_scale);
583 }
584 
585 bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
586   for (const int32_t input_idx : subgraph->inputs) {
587     if (index == input_idx) {
588       return true;
589     }
590   }
591   return false;
592 }
593 
594 // Quantize the op input. Will increment op_idx if ops are added.
595 TfLiteStatus QuantizeOpInput(
596     ModelT* model, int32_t subgraph_idx, size_t* op_idx,
597     operator_property::OperatorProperty property,
598     const std::pair<int32_t, operator_property::TensorProperty>& input,
599     const TensorType& activations_type, ErrorReporter* error_reporter) {
600   int32_t input_idx = input.first;
601   operator_property::TensorProperty tensor_property = input.second;
602   SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
603   OperatorT* op = subgraph->operators[*op_idx].get();
604   const BuiltinOperator op_code =
605       GetBuiltinCode(model->operator_codes[op->opcode_index].get());
606   if (input_idx >= op->inputs.size()) {
607     TF_LITE_REPORT_ERROR(
608         error_reporter,
609         "Required input index %d is larger than the input length of op "
610         "%s at index %d in subgraph %d",
611         input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code), *op_idx,
612         subgraph_idx);
613     return kTfLiteError;
614   }
615   const int32_t tensor_idx = op->inputs[input_idx];
616   if (tensor_idx == -1) {
617     // Skip optional tensor.
618     return kTfLiteOk;
619   }
620   TensorT* tensor = subgraph->tensors[tensor_idx].get();
621   // Assumes op is quantized to int8.
622   const bool is_input_quantized = utils::QuantizationParametersExist(tensor);
623   if (property.quantizable && !is_input_quantized) {
624     // The operation is quantizable, but the input isn't yet quantized.
625     if (utils::HasBuffer(model, subgraph, tensor_idx)) {
626       // TODO(suharshs): Look at consumers, throw error if one consumer is
627       // per-channel and one per-layer.
628       bool quantize_const_input = property.quantize_input_as_activations &&
629                                   activations_type == TensorType_INT16;
630       if (tensor_property.number_of_bits == 8 && !quantize_const_input) {
631         if (tensor_property.use_derived_scale) {
632           // Currently 8bit tensors in input do not accept derived scale.
633           return kTfLiteError;
634         }
635         if (utils::QuantizeWeight(model, tensor, tensor_property.per_axis,
636                                   tensor_property.per_axis_index,
637                                   error_reporter) != kTfLiteOk) {
638           TF_LITE_REPORT_ERROR(
639               error_reporter,
640               "Unable to quantize buffer or min/max value for input %d "
641               "in op %s in subgraph %d, node: %d",
642               input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
643               *op_idx);
644           return kTfLiteError;
645         }
646       } else if (tensor_property.number_of_bits == 16 || quantize_const_input) {
647         if (tensor_property.use_derived_scale) {
648           // Currently 16bit tensors in input do not accept derived scale.
649           return kTfLiteError;
650         }
651         TensorT* tensor = subgraph->tensors[tensor_idx].get();
652         int total_size = 1;
653         for (int i = 0; i < tensor->shape.size(); ++i) {
654           total_size *= tensor->shape[i];
655         }
656         BufferT* buffer = model->buffers[tensor->buffer].get();
657         float* float_data = reinterpret_cast<float*>(buffer->data.data());
658         auto minmax = std::minmax_element(float_data, float_data + total_size);
659         const float min = *minmax.first;
660         const float max = *minmax.second;
661         const float range = std::max(std::abs(min), std::abs(max));
662         // The narrow range quantized value for int16.
663         const float quantize_range = 32767.0;
664         const float scale = range / quantize_range;
665         return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
666                                                      error_reporter);
667       } else if (tensor_property.number_of_bits == 32) {
668         if (!tensor_property.use_derived_scale) {
669           // Currently 32 bit tensors in input only accept derived scale.
670           return kTfLiteError;
671         }
672         TensorT* tensor = subgraph->tensors[tensor_idx].get();
673         const float scale = utils::GetEffectiveScale(
674             model, subgraph, *op_idx,
675             tensor_property.derived_scale.input_tensors,
676             tensor_property.derived_scale.intermediate_tensors,
677             tensor_property.derived_scale.factors);
678         return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
679             model, tensor, scale, error_reporter);
680 
681       } else if (tensor_property.number_of_bits == 10) {
682         // When the number of bits is 10 (instead of 16), quantize the tensor to
683         // [-512, 512], instead of [-32767, 32767].
684         TensorT* tensor = subgraph->tensors[tensor_idx].get();
685         int total_size = 1;
686         for (int i = 0; i < tensor->shape.size(); ++i) {
687           total_size *= tensor->shape[i];
688         }
689         BufferT* buffer = model->buffers[tensor->buffer].get();
690         float* buffer_data = reinterpret_cast<float*>(buffer->data.data());
691         auto minmax =
692             std::minmax_element(buffer_data, buffer_data + total_size);
693         const float range =
694             std::max(std::abs(*minmax.first), std::abs(*minmax.second));
695         const float quantized_range = 512.0;
696         const float scale = range / quantized_range;
697         return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
698                                                      error_reporter);
699       } else {
700         // Only 8, 16, 32, 10 are supported.
701         // TODO(jianlijianli): extend this to support arbitrary bits.
702         TF_LITE_REPORT_ERROR(
703             error_reporter,
704             "Unable to quantize buffer or min/max value for input %d "
705             "in op %s in subgraph %d, node: %d",
706             input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
707         return kTfLiteError;
708       }
709     } else if (utils::HasMinMax(tensor)) {
710       if (IsSubgraphInput(subgraph, tensor_idx) ||
711           tensor_property.state_tensor) {
712         if (tensor_property.number_of_bits == 8) {
713           if (tensor_property.use_derived_scale) {
714             // Currently 8bit tensors in input do not accept derived scale.
715             return kTfLiteError;
716           }
717           TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
718               tensor, activations_type, error_reporter));
719         } else if (tensor_property.number_of_bits == 16) {
720           TensorT* tensor = subgraph->tensors[tensor_idx].get();
721           float quantized_range = 32767.0;
722           float range = std::max(std::abs(tensor->quantization->min[0]),
723                                  std::abs(tensor->quantization->max[0]));
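          // Some 16-bit tensors require a power-of-two scale; the range is then
          // extended to a power of two and the full 32768 range is used.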
724           if (tensor_property.extend_to_power_of_two) {
725             const int power_of_two_scale = utils::GetPowerOfTwoScale(
726                 tensor->quantization->min[0], tensor->quantization->max[0]);
727             range = std::pow(2, power_of_two_scale);
728             quantized_range = 32768.0;
729           }
730           const float scale = range / quantized_range;
731           utils::QuantizeActivationToInt16(tensor, scale);
732         }
733       } else {
734         // If the tensor is not a model input, we need to add a Quantize
735         // operation since the preceding op may require a float output.
736         std::string type_string =
737             activations_type == TensorType_INT16 ? "int16" : "int8";
738         std::unique_ptr<TensorT> op_output;
739         utils::MakeTensor(tensor->name + "_" + type_string, tensor->shape,
740                           tensor->shape_signature, activations_type,
741                           &op_output);
742         op_output->quantization = absl::make_unique<QuantizationParametersT>();
743         op_output->quantization->min.push_back(tensor->quantization->min[0]);
744         op_output->quantization->max.push_back(tensor->quantization->max[0]);
745         TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
746             op_output.get(), activations_type, error_reporter));
747         const int32_t quant_op_output_idx = subgraph->tensors.size();
748         subgraph->tensors.push_back(std::move(op_output));
749         std::unique_ptr<OperatorT> quant_op;
750         utils::MakeQuantizeOperator(model, &quant_op, tensor_idx,
751                                     quant_op_output_idx);
752         subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
753                                    std::move(quant_op));
754         op->inputs[input_idx] = quant_op_output_idx;
755         *op_idx += 1;
756       }
757     } else {
758       TF_LITE_REPORT_ERROR(error_reporter,
759                            "Unable to find buffer or min/max value for input "
760                            "%d in %s in subgraph %d, node: %d",
761                            input_idx, EnumNameBuiltinOperator(op_code),
762                            subgraph_idx, *op_idx);
763       return kTfLiteError;
764     }
765   } else if (!property.quantizable && is_input_quantized) {
766     // If the tensor is quantized, insert a Dequantize op in front of this op,
767     // since this op is not quantizable and expects a float input.
768     std::unique_ptr<TensorT> op_output;
769     utils::MakeTensor(tensor->name + "_float", tensor->shape,
770                       tensor->shape_signature, TensorType_FLOAT32, &op_output);
771     const int32_t dequant_op_output_idx = subgraph->tensors.size();
772     subgraph->tensors.push_back(std::move(op_output));
773     std::unique_ptr<OperatorT> dequant_op;
774     utils::MakeDequantizeOperator(model, &dequant_op, tensor_idx,
775                                   dequant_op_output_idx);
776     subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
777                                std::move(dequant_op));
778     op->inputs[input_idx] = dequant_op_output_idx;
779     *op_idx += 1;
780   }
781   return kTfLiteOk;
782 }
783 
784 // Quantize the op output.
785 TfLiteStatus QuantizeOpOutput(
786     ModelT* model, int32_t subgraph_idx, int32_t op_idx,
787     operator_property::OperatorProperty property,
788     const std::pair<int32_t, operator_property::TensorProperty>& output,
789     TensorType activations_type, ErrorReporter* error_reporter) {
790   int32_t output_idx = output.first;
791   operator_property::TensorProperty tensor_property = output.second;
792   // If the operator is not quantizable, we don't need to do anything for the
793   // output.
794   if (!property.quantizable) {
795     return kTfLiteOk;
796   }
797   SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
798   OperatorT* op = subgraph->operators[op_idx].get();
799   const BuiltinOperator op_code =
800       GetBuiltinCode(model->operator_codes[op->opcode_index].get());
801   if (output_idx >= op->outputs.size()) {
802     TF_LITE_REPORT_ERROR(
803         error_reporter,
804         "Required output index %d is larger than the output length of "
805         "op %s at index %d in subgraph %d",
806         output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
807         op_idx, subgraph_idx);
808     return kTfLiteError;
809   }
810 
811   TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
812   if (utils::QuantizationParametersExist(output_tensor)) {
813     // Skip output if it has been quantized.
814     return kTfLiteOk;
815   }
816   if (ShouldRestrictSameInputOutputScale(property)) {
817     // Copy the quantization parameters. For average pool, max pool, etc.,
818     // min/max can differ, but we want them to be the same.
819     // Get scale and zero point of input.
820     if (property.inputs[0].first >= op->inputs.size()) {
821       TF_LITE_REPORT_ERROR(
822           error_reporter,
823           "Required input index %d is larger than the input length of "
824           "op %s at index %d in subgraph %d",
825           property.inputs[0].first, op->inputs.size(),
826           EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
827       return kTfLiteError;
828     }
829     const int input_tensor_idx = op->inputs[property.inputs[0].first];
830     TensorT* input_tensor = subgraph->tensors[input_tensor_idx].get();
831     if (input_tensor->quantization->scale.size() != 1 ||
832         input_tensor->quantization->zero_point.size() != 1) {
833       TF_LITE_REPORT_ERROR(error_reporter,
834                            "Invalid quantization params for op %s at index %d "
835                            "in subgraph %d",
836                            EnumNameBuiltinOperator(op_code), op_idx,
837                            subgraph_idx);
838       return kTfLiteError;
839     }
840 
841     const float input_scale = input_tensor->quantization->scale[0];
842     const int32_t input_zero_point = input_tensor->quantization->zero_point[0];
843 
844     // Apply to output.
845     output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
846     output_tensor->quantization->scale.push_back(input_scale);
847     output_tensor->quantization->zero_point.push_back(input_zero_point);
848     if (!input_tensor->quantization->min.empty()) {
849       const float min = input_tensor->quantization->min[0];
850       output_tensor->quantization->min = {min};
851     }
852     if (!input_tensor->quantization->max.empty()) {
853       const float max = input_tensor->quantization->max[0];
854       output_tensor->quantization->max = {max};
855     }
856     output_tensor->type = activations_type;
857   } else if (tensor_property.restriction) {
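    // Some ops (e.g. softmax, logistic) require a fixed output scale and zero
    // point; use the restricted values for the chosen activation type.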
858     const auto scale_and_zp = activations_type == TensorType_INT16
859                                   ? tensor_property.restricted_value_int16
860                                   : tensor_property.restricted_value_int8;
861 
862     // Apply to output.
863     output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
864     output_tensor->quantization->scale.push_back(scale_and_zp.first);
865     output_tensor->quantization->zero_point.push_back(scale_and_zp.second);
866     output_tensor->type = activations_type;
867   } else {
868     // Process regular output that doesn't have any restrictions.
869     if (utils::HasMinMax(output_tensor)) {
870       utils::QuantizeActivation(output_tensor, activations_type,
871                                 error_reporter);
872     } else {
873       TF_LITE_REPORT_ERROR(
874           error_reporter,
875           "Unable to find min/max value for output %d in %s in "
876           "subgraph %d, node: %d",
877           output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
878       return kTfLiteError;
879     }
880   }
881   return kTfLiteOk;
882 }
883 
884 TfLiteStatus QuantizeIntemediateTensors(ModelT* model,
885                                         TensorType activations_type,
886                                         ErrorReporter* error_reporter) {
887   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
888        subgraph_idx++) {
889     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
890     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
891       operator_property::OperatorProperty property =
892           operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
893       if (!property.intermediates.empty()) {
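        // Intermediate tensors (used by ops such as LSTM) carry their own
        // bit-width and symmetry requirements in the operator property.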
894         OperatorT* op = subgraph->operators[op_idx].get();
895         const BuiltinOperator op_code =
896             GetBuiltinCode(model->operator_codes[op->opcode_index].get());
897         for (const std::pair<int, operator_property::TensorProperty>& input :
898              property.intermediates) {
899           const int index_local = input.first;
900           const int index_global = op->intermediates[index_local];
901           if (index_global == -1) {
902             // Skip optional tensor.
903             continue;
904           }
905           if (input.second.number_of_bits == 8 &&
906               input.second.symmetric == false) {
907             TensorT* tensor = subgraph->tensors[index_global].get();
908             if (tensor->quantization == nullptr) {
909               continue;
910             }
911             if (utils::HasMinMax(tensor)) {
912               utils::QuantizeActivation(tensor, activations_type,
913                                         error_reporter);
914             } else {
915               TF_LITE_REPORT_ERROR(error_reporter,
916                                    "Unable to find min/max value for "
917                                    "intermediate tensor %d in %s in "
918                                    "subgraph %d, node: %d",
919                                    index_local,
920                                    EnumNameBuiltinOperator(op_code),
921                                    subgraph_idx, op_idx);
922               return kTfLiteError;
923             }
924           } else if (input.second.number_of_bits == 16 &&
925                      input.second.symmetric == true) {
926             TensorT* tensor = subgraph->tensors[index_global].get();
927             if (tensor->quantization == nullptr) {
928               continue;
929             }
930             const float min = tensor->quantization->min[0];
931             const float max = tensor->quantization->max[0];
932             const float range = std::max(std::abs(min), std::abs(max));
933             if (range < 1e-8) {
934               return kTfLiteError;
935             }
936 
937             // Get scale and zero point.
938             const float quantized_range = 32767.0;
939             const float scale = range / quantized_range;
940             utils::QuantizeActivationToInt16(tensor, scale);
941           } else {
942             return kTfLiteError;
943           }
944         }
945       }
946     }
947   }
948   return kTfLiteOk;
949 }
950 
951 // Quantize tensors that have a shared range. For example, in LSTM, the output
952 // tensor and the input state tensor should share the same range because they
953 // use the same scale and zero point.
954 // We have to model this explicitly because the output is modeled as an extra
955 // tensor in LSTM. In the calibrator, state tensors are logged both before and
956 // after the inference, so the range is fully captured. But the output, although
957 // identical to the activation, is not a state tensor, so the input value
958 // (range) of the very first inference is not captured.
959 TfLiteStatus QuantizeSharedRange(ModelT* model, ErrorReporter* error_reporter) {
960   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
961        subgraph_idx++) {
962     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
963     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
964       operator_property::OperatorProperty property =
965           operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
966       if (!property.intermediates.empty()) {
967         OperatorT* op = subgraph->operators[op_idx].get();
968         for (const std::vector<int>& input : property.restrict_scale) {
969           if (input.empty()) {
970             continue;
971           }
972           // Currently only supports pairs of two tensors.
973           // TODO(b/174534943): extend to arbitrary number of tensors.
974           if (input.size() != 2) {
975             return kTfLiteError;
976           }
977           const int index_1 = input[0];
978           const int index_2 = input[1];
979           // TODO(jianlijianli): model input/output.
980           TensorT* tensor_1 = subgraph->tensors[op->inputs[index_1]].get();
981           TensorT* tensor_2 = subgraph->tensors[op->outputs[index_2]].get();
982           const float min_of_min = std::min(tensor_1->quantization->min[0],
983                                             tensor_2->quantization->min[0]);
984           const float max_of_max = std::max(tensor_1->quantization->max[0],
985                                             tensor_2->quantization->max[0]);
986           if (min_of_min == 0.0 && max_of_max == 0.0) {
987             return kTfLiteError;
988           }
989 
990           // Asymmetric quantization to 8 bit.
991           auto quantization_params =
992               absl::make_unique<QuantizationParametersT>();
993           utils::GetAsymmetricQuantizationParams(
994               min_of_min, max_of_max, -128, 127, quantization_params.get());
995 
996           // Populate both tensors with the same parameters.
997           const float scale = quantization_params->scale[0];
998           const int32 zero_point = quantization_params->zero_point[0];
999           for (TensorT* tensor : {tensor_1, tensor_2}) {
1000             tensor->quantization = absl::make_unique<QuantizationParametersT>();
1001             tensor->quantization->scale.push_back(scale);
1002             tensor->quantization->zero_point.push_back(zero_point);
1003             tensor->type = TensorType_INT8;
1004           }
1005         }
1006       }
1007     }
1008   }
1009   return kTfLiteOk;
1010 }
1011 
1012 // Quantize inputs and weights.
1013 // Because of ops such as LSTM, this has to be done per op, not per weight.
1014 TfLiteStatus QuantizeWeightsInputOutput(
1015     ModelT* model, bool allow_float,
1016     const std::unordered_set<string>& operator_names,
1017     const std::unordered_set<string>& real_value_op_set,
1018     const TensorType& activations_type, ErrorReporter* error_reporter) {
1019   // Flag to track unsupported ops.
1020   bool quantization_not_supported = false;
1021 
1022   // Loop over the graph and quantize ops.
1023   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1024        subgraph_idx++) {
1025     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1026     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1027       OperatorT* op = subgraph->operators[op_idx].get();
1028       const BuiltinOperator op_code =
1029           GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1030       const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1031       operator_property::OperatorProperty property =
1032           GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
1033                               operator_name, activations_type);
1034       if (!IsRealValueOp(real_value_op_set, operator_name)) {
1035         continue;
1036       }
1037 
1038       if (activations_type == TensorType_INT16 && !property.quantizable &&
1039           !allow_float) {
1040         TF_LITE_REPORT_ERROR(
1041             error_reporter,
1042             "Quantization to 16x8-bit not yet supported for op: '%s'.\n",
1043             EnumNameBuiltinOperator(op_code));
1044         quantization_not_supported = true;
1045       } else if (!property.quantizable && !allow_float) {
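        // A trailing DEQUANTIZE that feeds a graph output is tolerated even
        // when float ops are otherwise disallowed.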
1046         if (op_code == BuiltinOperator_DEQUANTIZE &&
1047             std::find(subgraph->outputs.begin(), subgraph->outputs.end(),
1048                       op->outputs[0]) != subgraph->outputs.end()) {
1049           continue;
1050         }
1051         TF_LITE_REPORT_ERROR(error_reporter,
1052                              "Quantization not yet supported for op: '%s'.\n",
1053                              EnumNameBuiltinOperator(op_code));
1054         quantization_not_supported = true;
1055       }
1056 
1057       // Quantize operator inputs/weights.
1058       for (const std::pair<int, operator_property::TensorProperty>& input :
1059            GetInputs(op, property)) {
1060         TF_LITE_ENSURE_STATUS(QuantizeOpInput(model, subgraph_idx, &op_idx,
1061                                               property, input, activations_type,
1062                                               error_reporter));
1063       }
1064 
1065       // Quantize operator outputs.
1066       for (const std::pair<int, operator_property::TensorProperty>& output :
1067            GetOutputs(op, property)) {
1068         TF_LITE_ENSURE_STATUS(
1069             QuantizeOpOutput(model, subgraph_idx, op_idx, property, output,
1070                              activations_type, error_reporter));
1071       }
1072     }
1073   }
1074 
1075   // Return; emit errors if there are any.
1076   if (quantization_not_supported) {
1077     return kTfLiteError;
1078   }
1079   return kTfLiteOk;
1080 }
1081 
1082 // Quantize bias.
1083 TfLiteStatus QuantizeBiases(ModelT* model,
1084                             const std::unordered_set<string>& operator_names,
1085                             const std::unordered_set<string>& real_value_op_set,
1086                             const TensorType& activations_type,
1087                             ErrorReporter* error_reporter) {
1088   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1089        subgraph_idx++) {
1090     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1091     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1092       OperatorT* op = subgraph->operators[op_idx].get();
1093       const BuiltinOperator op_code =
1094           GetBuiltinCode(model->operator_codes[op->opcode_index].get());
1095       const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1096       operator_property::OperatorProperty property =
1097           GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
1098                               operator_name, activations_type);
1099       if (!property.quantizable ||
1100           !IsRealValueOp(real_value_op_set, operator_name)) {
1101         continue;
1102       }
1103       for (const int bias_idx : property.biases) {
1104         if (bias_idx >= op->inputs.size() ||
1105             op->inputs[bias_idx] == kTfLiteOptionalTensor) {
1106           continue;
1107         }
1108         // Quantize only if the bias is not already quantized as the
1109         // output or input of another op.
1110         TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
1111         if (!utils::QuantizationParametersExist(bias_tensor)) {
1112           if (utils::HasBuffer(model, subgraph, op->inputs[bias_idx])) {
1113             if (property.inputs.size() != 2) {
1114               TF_LITE_REPORT_ERROR(error_reporter,
1115                                    "Expect the input length of "
1116                                    "op %s at index %d in subgraph %d to be 2",
1117                                    bias_idx, op->inputs.size(),
1118                                    EnumNameBuiltinOperator(op_code), op_idx,
1119                                    subgraph_idx);
1120               return kTfLiteError;
1121             }
1122             TensorT* input_tensor =
1123                 subgraph->tensors[op->inputs[property.inputs[0].first]].get();
1124             TensorT* weight_tensor =
1125                 subgraph->tensors[op->inputs[property.inputs[1].first]].get();
1126             operator_property::TensorProperty weight_property =
1127                 property.inputs[1].second;
1128             TF_LITE_ENSURE_STATUS(QuantizeBias(
1129                 model, input_tensor, weight_tensor, bias_tensor,
1130                 weight_property.per_axis, weight_property.per_axis_index,
1131                 activations_type, error_reporter));
1132           }
1133         } else {
1134           // If bias is already quantized, make sure it is quantized to 32 bit.
1135           if (bias_tensor->type != TensorType_INT32) {
1136             TF_LITE_REPORT_ERROR(
1137                 error_reporter,
1138                 "Bias (\"%s\" at global index %d) of op \"%s\" at op_index %d "
1139                 "in subgraph %d is expected to be quantized to INT32 but it is "
1140                 "already quantized to %s.\n",
1141                 bias_tensor->name.c_str(), op->inputs[bias_idx],
1142                 operator_name.c_str(), op_idx, subgraph_idx,
1143                 EnumNameTensorType(bias_tensor->type));
1144           }
1145         }
1146       }
1147     }
1148   }
1149   return kTfLiteOk;
1150 }
1151 
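// Returns the names of all tensors in the model. Despite the name, this is
// not restricted to operator outputs; it is used as the `operator_names` set
// when every operator should be considered for quantization (see
// QuantizeModelAllOperators below).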
1152 std::unordered_set<string> GetAllOperatorOutputs(ModelT* model) {
1153   std::unordered_set<string> operator_names;
1154   for (int32_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1155        subgraph_idx++) {
1156     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1157     for (int32_t tensor_idx = 0; tensor_idx < subgraph->tensors.size();
1158          tensor_idx++) {
1159       operator_names.insert(subgraph->tensors[tensor_idx]->name);
1160     }
1161   }
1162   return operator_names;
1163 }
1164 // Populate the min and max quantization parameters for input tensors.
1165 // Assumes that dynamic tensors already have stored min, max values and throws
1166 // an error if a tensor does not have min/max quantization parameters or a
1167 // buffer.
1168 // If any static tensors are not inputs to an operation, their max, min values
1169 // will not be filled by this function.
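// Here a "static" tensor is a constant tensor backed by a buffer (e.g.
// weights), while a "dynamic" tensor is produced at runtime (e.g.
// activations) and relies on calibration for its min/max.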
1170 TfLiteStatus FillQuantizationParams(
1171     ModelT* model, const std::unordered_set<string>& operator_names,
1172     const std::unordered_set<string>& real_value_op_set,
1173     const TensorType& activations_type, ErrorReporter* error_reporter) {
1174   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1175        subgraph_idx++) {
1176     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1177     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1178       OperatorT* op = subgraph->operators[op_idx].get();
1179       const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1180       operator_property::OperatorProperty property =
1181           GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
1182                               operator_name, activations_type);
1183       if (!IsRealValueOp(real_value_op_set, operator_name)) {
1184         continue;
1185       }
1186 
1187       // Populate max, min for each input tensor.
1188       for (const std::pair<int, operator_property::TensorProperty>& input :
1189            property.inputs) {
1190         // Get tensor.
1191         const int32_t input_idx = input.first;
1192         const int32_t tensor_idx = op->inputs[input_idx];
1193         if (tensor_idx == -1) {
1194           // Skip optional tensor.
1195           continue;
1196         }
1197         TensorT* tensor = subgraph->tensors[tensor_idx].get();
1198 
1199         // Static tensor.
1200         if (!utils::HasMinMax(tensor) &&
1201             utils::HasBuffer(model, subgraph, tensor_idx)) {
1202           // Get input float data and tensor dimensions.
1203           const BufferT* buffer = model->buffers[tensor->buffer].get();
1204           const float* float_input_data =
1205               reinterpret_cast<const float*>(buffer->data.data());
1206 
1207           if (tensor->quantization == nullptr) {
1208             tensor->quantization = absl::make_unique<QuantizationParametersT>();
1209           }
1210 
1211           // Fill per channel max and min with respect to channel_dim_index.
1212           if (input.second.per_axis) {
1213             if (tensor->shape.size() == 4) {
1214               int32_t channel_dim_index = input.second.per_axis_index;
1215               TF_LITE_ENSURE_STATUS(utils::FillPerChannelMinMax(
1216                   float_input_data, tensor->shape, channel_dim_index,
1217                   tensor->quantization.get(), error_reporter));
1218             } else {
1219               TF_LITE_REPORT_ERROR(
1220                   error_reporter,
1221                   "Could not fill max min for tensor as the dimension is %d "
1222                   "and not 4 as expected.",
1223                   tensor->shape.size());
1224               return kTfLiteError;
1225             }
1226 
1227             // Fill per layer max and min.
1228           } else if (!utils::HasMinMax(tensor) && !input.second.per_axis &&
1229                      utils::HasBuffer(model, subgraph, tensor_idx)) {
1230             uint64_t input_size;
1231             TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &input_size));
1232             utils::FillSingleMinMax(float_input_data, input_size,
1233                                     tensor->quantization.get());
1234           }
1235           if (tensor->quantization->quantized_dimension !=
1236               input.second.per_axis_index) {
1237             TF_LITE_REPORT_ERROR(
1238                 error_reporter,
1239                 "Quantized dimension for tensor property and quantization "
1240                 "parameters do not match. Got %d and %d respectively.",
1241                 input.second.per_axis_index,
1242                 tensor->quantization->quantized_dimension);
1243             return kTfLiteError;
1244           }
1245 
1246           // Dynamic tensor.
1247         } else if (!utils::HasMinMax(tensor) &&
1248                    !utils::HasBuffer(model, subgraph, tensor_idx)) {
1249           TF_LITE_REPORT_ERROR(
1250               error_reporter,
1251               "Max and min for dynamic tensors should be"
1252               " recorded during calibration: Failed for tensor %s\n",
1253               tensor->name.c_str());
1254           if (tensor->quantization == nullptr) {
1255             TF_LITE_REPORT_ERROR(error_reporter,
1256                                  "No quantization params for tensor %s",
1257                                  tensor->name.c_str());
1258           } else if (tensor->quantization->min.empty() ||
1259                      tensor->quantization->max.empty()) {
1260             TF_LITE_REPORT_ERROR(error_reporter, "Empty min/max for tensor %s",
1261                                  tensor->name.c_str());
1262           }
1263           return kTfLiteError;
1264         }
1265 
1266         if (utils::QuantizationParametersExist(tensor)) {
1267           TF_LITE_REPORT_ERROR(
1268               error_reporter,
1269               "Scale and zero points should not be recorded before "
1270               "quantization.");
1271           return kTfLiteError;
1272         }
1273       }  // loop over op inputs
1274     }    // loop over ops
1275   }      // loop over subgraphs
1276   return kTfLiteOk;
1277 }
1278 
1279 // Check compatibility of activation, weight and bias scales. Adjust if needed.
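// The bias scale is tied to the input and weight scales, so the recorded
// weight min/max may be adjusted here (via utils::AdjustWeightsForBiasScale)
// to keep the float bias values representable once the bias is quantized.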
1280 TfLiteStatus EnsureBiasScaleCompatibility(
1281     ModelT* model, const std::unordered_set<string>& operator_names,
1282     const std::unordered_set<string>& real_value_op_set,
1283     const TensorType& activations_type, ErrorReporter* error_reporter) {
1284   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
1285        subgraph_idx++) {
1286     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
1287     for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
1288       OperatorT* op = subgraph->operators[op_idx].get();
1289       const string operator_name = subgraph->tensors[op->outputs[0]]->name;
1290       operator_property::OperatorProperty property =
1291           GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
1292                               operator_name, activations_type);
1293       if (!IsRealValueOp(real_value_op_set, operator_name)) {
1294         continue;
1295       }
1296 
1297       // Loop over all bias tensors.
1298       for (const int bias_idx : property.biases) {
1299         if (bias_idx >= op->inputs.size() ||
1300             op->inputs[bias_idx] == kTfLiteOptionalTensor) {
1301           continue;
1302         }
1303         TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
1304         if (bias_tensor->shape.size() != 1) {
1305           TF_LITE_REPORT_ERROR(error_reporter,
1306                                "Expected bias tensor to be a vector.");
1307           return kTfLiteError;
1308         }
1309         const int32_t channel_dim_size = bias_tensor->shape[0];
1310 
1311         if (property.inputs.size() != 2) {  // Only works for two input tensors.
1312           TF_LITE_REPORT_ERROR(
1313               error_reporter,
1314               "Expect op at index %d in subgraph %d to have 2 inputs, got %d",
1315               op_idx, subgraph_idx, property.inputs.size());
1316           return kTfLiteError;
1317         }
1318 
1319         if (!property.arbitrary_inputs && property.quantizable) {
1320           // Get input and weight tensors.
1321           TensorT* input_tensor =
1322               subgraph->tensors[op->inputs[property.inputs[0].first]].get();
1323           TensorT* weight_tensor =
1324               subgraph->tensors[op->inputs[property.inputs[1].first]].get();
1325           operator_property::TensorProperty weight_property =
1326               property.inputs[1].second;
1327           TF_LITE_ENSURE(error_reporter, input_tensor->quantization);
1328 
1329           // Check quantization parameters exist for input.
1330           if (!utils::HasMinMax(input_tensor)) {
1331             TF_LITE_REPORT_ERROR(
1332                 error_reporter,
1333                 "Input tensor missing quantization information. Should be "
1334                 "populated during calibration.");
1335             return kTfLiteError;
1336           }
1337 
1338           // Get input scale for asymmetric quantization.
1339           QuantizationParametersT temp_quant_params = QuantizationParametersT();
1340           TF_LITE_ENSURE_STATUS(
1341               utils::GetQuantizationParams(input_tensor, activations_type,
1342                                            &temp_quant_params, error_reporter));
1343           if (temp_quant_params.scale.size() != 1) {
1344             TF_LITE_REPORT_ERROR(error_reporter,
1345                                  "Unexpected input quantization scale size.");
1346             return kTfLiteError;
1347           }
1348           float input_scale = temp_quant_params.scale[0];
1349 
1350           // Check that max/min values have been filled for weights.
1351           if (!utils::HasMinMax(weight_tensor)) {
1352             TF_LITE_REPORT_ERROR(
1353                 error_reporter,
1354                 "Min and/or max values have not been recorded for weight "
1355                 "tensor. This should have happened in FillQuantizationParams.");
1356             return kTfLiteError;
1357           }
1358 
1359           // Ensure the tensor dimensions are compatible.
1360           if (weight_property.per_axis) {
1361             if (bias_tensor->shape[0] !=
1362                 weight_tensor->shape[weight_property.per_axis_index]) {
1363               TF_LITE_REPORT_ERROR(
1364                   error_reporter,
1365                   "Channel mismatch between bias and weight tensors %d vs %d",
1366                   bias_tensor->shape[0],
1367                   weight_tensor->shape[weight_property.per_axis_index]);
1368               return kTfLiteError;
1369             }
1370             // Ensure that the number of max/mins matches the channel_dim_size.
1371             if (weight_tensor->quantization->max.size() != channel_dim_size) {
1372               TF_LITE_REPORT_ERROR(
1373                   error_reporter,
1374                   "Mismatch between number of weight maxs and channels: %d vs "
1375                   "%d",
1376                   weight_tensor->quantization->max.size(), channel_dim_size);
1377               return kTfLiteError;
1378             }
1379             if (weight_tensor->quantization->min.size() != channel_dim_size) {
1380               TF_LITE_REPORT_ERROR(
1381                   error_reporter,
1382                   "Mismatch between number of weight mins and channels: %d",
1383                   weight_tensor->quantization->min.size());
1384               return kTfLiteError;
1385             }
1386           }
1387 
1388           // Get data and size of bias tensor.
1389           const BufferT* buffer = model->buffers[bias_tensor->buffer].get();
1390           const float* bias_data =
1391               reinterpret_cast<const float*>(buffer->data.data());
1392           uint64_t bias_size;
1393           TF_LITE_ENSURE_STATUS(utils::NumElements(*bias_tensor, &bias_size));
1394 
1395           // Adjust weight scales if needed.
1396           TF_LITE_ENSURE_STATUS(utils::AdjustWeightsForBiasScale(
1397               weight_tensor->quantization.get(), bias_data, bias_size,
1398               input_scale, error_reporter));
1399 
1400           if (utils::QuantizationParametersExist(weight_tensor)) {
1401             TF_LITE_REPORT_ERROR(
1402                 error_reporter,
1403                 "Scale and zero points should not be recorded for the weight "
1404                 "tensor before quantization.");
1405             return kTfLiteError;
1406           }
1407           if (utils::QuantizationParametersExist(input_tensor)) {
1408             TF_LITE_REPORT_ERROR(
1409                 error_reporter,
1410                 "Scale and zero points should not be recorded for the input "
1411                 "tensor before quantization.");
1412             return kTfLiteError;
1413           }
1414         }
1415       }
1416     }
1417   }
1418   return kTfLiteOk;
1419 }
1420 
1421 }  // namespace
1422 
1423 // Assumes that the operators in the model have been topologically sorted.
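// Passes run in the order below: fill min/max quantization parameters,
// enforce bias/weight scale compatibility, quantize intermediate and
// shared-range tensors, quantize weights/inputs/outputs, apply per-op
// constraints, quantize biases, then update operator versions and model
// input/output types before re-packing the model into `builder`.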
1424 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1425                            ModelT* model, const TensorType& input_type,
1426                            const TensorType& output_type, bool allow_float,
1427                            const std::unordered_set<string>& operator_names,
1428                            const TensorType& activations_type,
1429                            ErrorReporter* error_reporter) {
1430   auto real_value_op_set =
1431       PopulateRealValueOpSet(model, operator_names, activations_type);
1432   TF_LITE_ENSURE_STATUS(
1433       FillQuantizationParams(model, operator_names, real_value_op_set,
1434                              activations_type, error_reporter));
1435   TF_LITE_ENSURE_STATUS(
1436       EnsureBiasScaleCompatibility(model, operator_names, real_value_op_set,
1437                                    activations_type, error_reporter));
1438   TF_LITE_ENSURE_STATUS(
1439       QuantizeIntemediateTensors(model, activations_type, error_reporter));
1440   TF_LITE_ENSURE_STATUS(QuantizeSharedRange(model, error_reporter));
1441   TF_LITE_ENSURE_STATUS(QuantizeWeightsInputOutput(
1442       model, allow_float, operator_names, real_value_op_set, activations_type,
1443       error_reporter));
1444   TF_LITE_ENSURE_STATUS(ApplyConstraints(model, operator_names,
1445                                          real_value_op_set, activations_type,
1446                                          error_reporter));
1447   TF_LITE_ENSURE_STATUS(QuantizeBiases(model, operator_names, real_value_op_set,
1448                                        activations_type, error_reporter));
1449   utils::SetOperatorCodeVersion(model);
1450   TF_LITE_ENSURE_STATUS(SetInputAndOutputTypes(
1451       model, input_type, output_type, activations_type, error_reporter));
1452   SetOperatorPropertyADDSUBOperator(model, activations_type);
1453   flatbuffers::Offset<Model> output_model_location =
1454       Model::Pack(*builder, model);
1455   FinishModelBuffer(*builder, output_model_location);
1456 
1457   return kTfLiteOk;
1458 }
1459 
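// A minimal usage sketch for full-integer (INT8) quantization; it assumes a
// calibrated float model deserialized into `model`, a caller in (or using)
// the tflite::optimize namespace, and StderrReporter for error reporting:
//
//   ModelT model;  // populated from a calibrated FlatBuffer model
//   flatbuffers::FlatBufferBuilder builder;
//   StderrReporter reporter;
//   if (QuantizeModelAllOperators(&builder, &model, TensorType_INT8,
//                                 TensorType_INT8, /*allow_float=*/false,
//                                 TensorType_INT8, &reporter) != kTfLiteOk) {
//     // Quantization failed; details were reported through `reporter`.
//   }
//   // On success, builder.GetBufferPointer()/GetSize() hold the serialized
//   // quantized model.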
1460 TfLiteStatus QuantizeModelAllOperators(flatbuffers::FlatBufferBuilder* builder,
1461                                        ModelT* model,
1462                                        const TensorType& input_type,
1463                                        const TensorType& output_type,
1464                                        bool allow_float,
1465                                        const TensorType& activations_type,
1466                                        ErrorReporter* error_reporter) {
1467   return QuantizeModel(builder, model, input_type, output_type, allow_float,
1468                        GetAllOperatorOutputs(model), activations_type,
1469                        error_reporter);
1470 }
1471 
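// The remaining overloads progressively default the activation type to INT8,
// allow_float to false, and the model input/output types to FLOAT32.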
1472 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1473                            ModelT* model, const TensorType& input_type,
1474                            const TensorType& output_type, bool allow_float,
1475                            ErrorReporter* error_reporter) {
1476   return QuantizeModel(builder, model, input_type, output_type, allow_float,
1477                        GetAllOperatorOutputs(model), TensorType_INT8,
1478                        error_reporter);
1479 }
1480 
1481 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1482                            ModelT* model, const TensorType& input_type,
1483                            const TensorType& output_type,
1484                            ErrorReporter* error_reporter) {
1485   return QuantizeModel(builder, model, input_type, output_type,
1486                        /*allow_float=*/false, error_reporter);
1487 }
1488 
1489 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
1490                            ModelT* model, ErrorReporter* error_reporter) {
1491   return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32,
1492                        /*allow_float=*/false, error_reporter);
1493 }
1494 
1495 }  // namespace optimize
1496 }  // namespace tflite
1497