1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h"
16 
17 #include <stddef.h>
18 
19 #include <cstdint>
20 #include <vector>
21 
22 // Only use multi-threaded Eigen if ruy is disabled.
23 #if !defined(TFLITE_WITH_RUY)
24 #define TFLITE_WITH_MULTITHREADED_EIGEN
25 #endif
26 
27 #include "tensorflow/lite/c/builtin_op_data.h"
28 #include "tensorflow/lite/c/common.h"
29 #include "tensorflow/lite/kernels/cpu_backend_context.h"
30 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
31 #include "tensorflow/lite/kernels/eigen_support.h"
32 #endif
33 #include "tensorflow/lite/kernels/internal/compatibility.h"
34 #include "tensorflow/lite/kernels/internal/types.h"
35 // b/131835803 forces us to include multithreaded_conv.h before optimized_ops.h
36 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
37 #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
38 #endif
39 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
40 #include "tensorflow/lite/kernels/internal/quantization_util.h"
41 #include "tensorflow/lite/kernels/internal/reference/conv.h"
42 #include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
43 #include "tensorflow/lite/kernels/internal/tensor.h"
44 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
45 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
46 #include "tensorflow/lite/kernels/kernel_util.h"
47 #include "tensorflow/lite/kernels/padding.h"
48 #include "tensorflow/lite/util.h"
49 
50 namespace tflite {
51 namespace ops {
52 namespace builtin {
53 namespace conv {
54 
55 // This file has 4 implementations of Conv.
56 enum KernelType {
57   kReference,
58   kGenericOptimized,  // Neon-free
59   // kMultithreadOptimized is a mixture of an Eigen-based kernel when threads
60   // are available and kGenericOptimized when we must use only one thread.
61   kMultithreadOptimized,
62   // This kernel uses the CBLAS interface for matrix multiplication.
63   // It's fast when an optimized CBLAS implementation is available (e.g. the
64   // Apple Accelerate Framework), and slow when falling back to the naive
65   // implementation.
66   kCblasOptimized,
67 };
68 
69 const int kTensorNotAllocated = -1;
70 
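// On mobile platforms, im2col is disabled (and a fallback kernel is used
// instead) when the temporary im2col tensor would exceed this size.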
71 static constexpr size_t kMaxIm2colBufferSizeMobile = 1024 * 1024 * 1024;  // 1GB
72 
73 struct OpData {
74   // IDs are the arbitrary identifiers used by TF Lite to identify and access
75   // memory buffers.
76   int im2col_id = kTensorNotAllocated;
77   int hwcn_weights_id = kTensorNotAllocated;
78   int input_quantized_id = kTensorNotAllocated;
79   int scaling_factors_id = kTensorNotAllocated;
80   int input_offset_id = kTensorNotAllocated;
81   int accum_scratch_id = kTensorNotAllocated;
82   // Row sums are used to cache filter sums for hybrid zero-point calculations.
83   int row_sums_id = kTensorNotAllocated;
84 
85   TfLitePaddingValues padding;
86   // The scaling factor from input to output (aka the 'real multiplier') can
87   // be represented as a fixed point multiplier plus a left shift.
88   int32_t output_multiplier;
89   int output_shift;
90 
91   // Per channel output multiplier and shift.
92   std::vector<int32_t> per_channel_output_multiplier;
93   std::vector<int> per_channel_output_shift;
94 
95   // The range of the fused activation layer. For example for kNone and
96   // uint8_t these would be 0 and 255.
97   int32_t output_activation_min;
98   int32_t output_activation_max;
99   // Indexes are the offsets into the array used to keep track of the
100   // allocated temporary buffers.
101   int32_t im2col_index;
102   int32_t hwcn_weights_index;
103   int32_t input_quantized_index;
104   int32_t scaling_factors_index;
105   int32_t accum_scratch_index;
106   int32_t input_offset_index;
107   int32_t row_sums_index;
108 
109   bool need_hwcn_weights = false;
110   bool have_weights_been_transposed = false;
111   bool need_im2col = false;
112   // If true, im2col is needed but has been disabled because the temporary
113   // im2col tensor would require too much memory (i.e.
114   // >= kMaxIm2colBufferSizeMobile).
115   bool im2col_oversized = false;
116 
117   bool supports_multithreaded_kernel = false;
118   bool is_hybrid_per_channel = false;
119   bool compute_hybrid_row_sums = true;
120 };
121 
122 inline PaddingType RuntimePaddingType(TfLitePadding padding) {
123   switch (padding) {
124     case TfLitePadding::kTfLitePaddingSame:
125       return PaddingType::kSame;
126     case TfLitePadding::kTfLitePaddingValid:
127       return PaddingType::kValid;
128     case TfLitePadding::kTfLitePaddingUnknown:
129     default:
130       return PaddingType::kNone;
131   }
132 }
133 
134 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
135   // This is a builtin op, so we don't use the contents in 'buffer', if any.
136   // Instead, we allocate a new object to use as scratch space for im2col, and
137   // to carry information from Prepare() to Eval().
138   auto* data = new OpData;
139 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
140   eigen_support::IncrementUsageCounter(context);
141 #endif
142   return data;
143 }
144 
145 void Free(TfLiteContext* context, void* buffer) {
146 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
147   eigen_support::DecrementUsageCounter(context);
148 #endif
149   delete reinterpret_cast<OpData*>(buffer);
150 }
151 
152 // Naive implementation of transpose for floats. Could be optimized to be more
153 // cache friendly, but for now it's a one-time cost on first run, and we would
154 // prefer to remove the need to do this at all eventually.
155 void TransposeFloatTensor(const TfLiteTensor* input, TfLiteTensor* output) {
156   const int rows = output->dims->data[1];
157   const int cols = output->dims->data[0];
158   const float* input_data = GetTensorData<float>(input);
159   float* output_data = GetTensorData<float>(output);
160   for (int i = 0; i < rows; ++i) {
161     for (int j = 0; j < cols; ++j) {
162       const float in_value = input_data[i * cols + j];
163       output_data[j * rows + i] = in_value;
164     }
165   }
166 }
167 
168 // Check if im2col needs to be allocated, as some versions of the optimized
169 // Conv don't use it. If im2col support changes in any of the Conv versions,
170 // this function should be updated here as well.
171 bool IsIm2ColRequired(const TfLiteTensor* input, TfLiteConvParams* params,
172                       const TfLiteTensor* filter, OpData* data, bool is_hybrid,
173                       KernelType kernel_type) {
174   // If HWCN weights are required, im2col is not required.
175   if (data->need_hwcn_weights) return false;
176 
177   // Segregate based on dilated vs. non-dilated conv.
178   const bool need_dilated_im2col =
179       params->dilation_width_factor != 1 || params->dilation_height_factor != 1;
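  // Non-dilated im2col is needed whenever the convolution cannot be expressed
  // as a plain GEMM over the input, i.e. for non-unit strides or non-1x1
  // filters.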
180   const bool need_non_dilated_im2col =
181       params->stride_width != 1 || params->stride_height != 1 ||
182       filter->dims->data[2] != 1 || filter->dims->data[1] != 1;
183 
184   const bool need_im2col = need_dilated_im2col || need_non_dilated_im2col;
185 
186   // Return early as the basic requirement is not met.
187   if (!need_im2col) return false;
188 
189   // Special case for hybrid, as it currently supports only non-dilated im2col.
190   const bool is_hybrid_non_dilated = is_hybrid && need_non_dilated_im2col;
191   const bool is_quantized =
192       input->type == kTfLiteUInt8 || input->type == kTfLiteInt8;
193 
194   switch (kernel_type) {
195     case kReference:
196       if (is_hybrid) {
197         return true;
198       } else {
199         return false;
200       }
201     case kGenericOptimized:
202     case kCblasOptimized:
203       if (is_hybrid && !need_non_dilated_im2col) {
204         return false;
205       } else {
206         return true;
207       }
208     case kMultithreadOptimized:
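      // The multithreaded Eigen kernel itself does not use im2col (see
      // EvalFloat); it is only needed when this case has to fall back to
      // another path (hybrid non-dilated, quantized, or when the
      // multithreaded kernel is unsupported).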
209       if (is_hybrid_non_dilated || is_quantized ||
210           !data->supports_multithreaded_kernel) {
211         return true;
212       } else {
213         return false;
214       }
215     default:
216       return false;
217   }
218 }
219 
220 // Allocate temporary tensors (`im2col`, `hwcn_weights` if necessary).
221 // Note: `context->AddTensors` might invalidate pointers to existing tensors.
222 // Therefore the logic to add tensors is isolated into this function.
223 static TfLiteStatus AllocateTemporaryTensorsIfRequired(
224     TfLiteContext* context, TfLiteNode* node, bool is_hybrid,
225     bool is_per_channel, KernelType kernel_type, size_t im2col_bytes) {
226   auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
227   OpData* data = reinterpret_cast<OpData*>(node->user_data);
228 
229   TF_LITE_ENSURE(context, node->inputs->size >= 2);
230   const TfLiteTensor* input;
231   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
232   const TfLiteTensor* filter;
233   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
234 
235   // If we're using the optimized multithreaded EigenTensor implementation of
236   // convolution, it expects the filter weights to be transposed compared to
237   // the normal TF Lite buffer format. Typical TF Lite weights are
238   // [filter_count, filter_height, filter_width, input_depth], but for the float
239   // implementation we need them as [filter_height, filter_width, input_depth,
240   // filter_count]. We get to that format by transposing, and create a temporary
241   // buffer to store the results.
242   // This path is only used for float processing, so only create the buffer if
243   // we're running with that data type.
244   data->need_hwcn_weights =
245       input->type == kTfLiteFloat32 && data->supports_multithreaded_kernel;
246 
247   // We don't always need to allocate im2col. It is only used in some versions
248   // of the optimized Conv. This test just mimics something that happens inside
249   // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
250   data->need_im2col =
251       IsIm2ColRequired(input, params, filter, data, is_hybrid, kernel_type);
252 
253   // If im2col_oversized is found to be true, we have to fall back to an
254   // execution path (like kReference in the float/quantized cases) that doesn't
255   // require the im2col operation. Therefore, we skip this check for the hybrid
256   // case (but not the hybrid-per-channel one), which has no such fallback
257   // execution path.
258   // TODO(b/178743262): Consider making this check conditioned on the available
259   // memory of the system, rather than coupling to the mobile platform check.
260   if (IsMobilePlatform() && !(is_hybrid && !is_per_channel) &&
261       data->need_im2col && im2col_bytes >= kMaxIm2colBufferSizeMobile) {
262     data->need_im2col = false;
263     data->im2col_oversized = true;
264   }
265   int temporaries_count = 0;
266   if (data->need_im2col) {
267     data->im2col_index = temporaries_count;
268     if (data->im2col_id == kTensorNotAllocated) {
269       context->AddTensors(context, 1, &data->im2col_id);
270     }
271     ++temporaries_count;
272   }
273   if (data->need_hwcn_weights) {
274     data->hwcn_weights_index = temporaries_count;
275     if (data->hwcn_weights_id == kTensorNotAllocated) {
276       context->AddTensors(context, 1, &data->hwcn_weights_id);
277     }
278     ++temporaries_count;
279   }
280 
281   if (is_hybrid) {
282     // Allocate tensor to store the on-the-fly quantized inputs.
283     data->input_quantized_index = temporaries_count;
284     if (data->input_quantized_id == kTensorNotAllocated) {
285       TF_LITE_ENSURE_OK(
286           context, context->AddTensors(context, 1, &data->input_quantized_id));
287     }
288     ++temporaries_count;
289 
290     // Allocate tensor to store the quantization params computed during
291     // on-the-fly input quantization.
292     data->scaling_factors_index = temporaries_count;
293     if (data->scaling_factors_id == kTensorNotAllocated) {
294       TF_LITE_ENSURE_OK(
295           context, context->AddTensors(context, 1, &data->scaling_factors_id));
296     }
297     ++temporaries_count;
298 
299     // Allocate tensor to store the accumulators for the matrix multiply.
300     data->accum_scratch_index = temporaries_count;
301     if (data->accum_scratch_id == kTensorNotAllocated) {
302       TF_LITE_ENSURE_OK(
303           context, context->AddTensors(context, 1, &data->accum_scratch_id));
304     }
305     ++temporaries_count;
306     if (is_per_channel) {
307       data->input_offset_index = temporaries_count;
308       if (data->input_offset_id == kTensorNotAllocated) {
309         TF_LITE_ENSURE_OK(
310             context, context->AddTensors(context, 1, &data->input_offset_id));
311       }
312       ++temporaries_count;
313 
314       data->row_sums_index = temporaries_count;
315       if (data->row_sums_id == kTensorNotAllocated) {
316         TF_LITE_ENSURE_OK(context,
317                           context->AddTensors(context, 1, &data->row_sums_id));
318       }
319       ++temporaries_count;
320     }
321   }
322 
323   TfLiteIntArrayFree(node->temporaries);
324   node->temporaries = TfLiteIntArrayCreate(temporaries_count);
325 
326   return kTfLiteOk;
327 }
328 
329 TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context,
330                      TfLiteNode* node) {
331   auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
332   OpData* data = reinterpret_cast<OpData*>(node->user_data);
333 
334   bool has_bias = node->inputs->size == 3;
335   // Check number of inputs/outputs
336   TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
337   TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
338   TfLiteTensor* output;
339   TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
340   const TfLiteTensor* input;
341   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
342   const TfLiteTensor* filter;
343   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
344 
345   // Check dimensionality of input, filter
346   TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
347   TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
348   // Check that the input channel count matches the filter's input depth.
349   TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]);
350 
351   // Check types. (We assume that UINT8 refers to quantized tensors)
352   TfLiteType input_type = input->type;
353   TF_LITE_ENSURE(context,
354                  input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
355                      input_type == kTfLiteInt8 || input_type == kTfLiteInt16);
356   TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_type);
357 
358   if (input_type == kTfLiteInt16) {
359     TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0);
360     TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
361   }
362 
363   const TfLiteTensor* bias = nullptr;
364 
365   // TODO(ahentz): At this point the optimized versions require 'bias'. We can
366   // either change that or document that convolution requires it.
367   TF_LITE_ENSURE(context, has_bias);
368 
369   if (has_bias) {
370     TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 2, &bias));
371     if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) {
372       TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt32);
373       TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
374     } else if (input_type == kTfLiteInt16) {
375       TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt64);
376       TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
377     } else {
378       TF_LITE_ENSURE_TYPES_EQ(context, bias->type, input_type);
379     }
380     TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
381   }
382 
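  // 'Hybrid' here means float input/output with quantized (int8/uint8) filter
  // weights; the float inputs are quantized on the fly at Eval time.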
383   const bool is_hybrid =
384       (input->type == kTfLiteFloat32 &&
385        (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8));
386 
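  // An int8 hybrid filter is treated as per-channel when its quantization
  // carries more than one distinct scale value.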
387   if (is_hybrid && filter->type == kTfLiteInt8 &&
388       filter->quantization.type == kTfLiteAffineQuantization &&
389       filter->quantization.params &&
390       reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params)
391           ->scale &&
392       reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params)
393               ->scale->size > 1) {
394     const auto* affine_quantization =
395         reinterpret_cast<TfLiteAffineQuantization*>(
396             filter->quantization.params);
397     const float scale = affine_quantization->scale->data[0];
398     for (int i = 1; i < affine_quantization->scale->size; i++) {
399       if (affine_quantization->scale->data[i] != scale) {
400         data->is_hybrid_per_channel = true;
401         break;
402       }
403     }
404   }
405 
406   // The multi-threaded kernel supports neither dilation nor hybrid execution,
407   // and is incompatible with mutable input filters that might change between evals.
408   data->supports_multithreaded_kernel =
409       (kernel_type == kMultithreadOptimized) &&
410       (context->recommended_num_threads != 1) && !is_hybrid &&
411       (params->dilation_width_factor == 1) &&
412       (params->dilation_height_factor == 1) &&
413       (filter->allocation_type != kTfLiteArenaRw) && !IsDynamicTensor(filter);
414 
415   int channels_in = filter->dims->data[3];
416   int channels_out = filter->dims->data[0];
417   int width = input->dims->data[2];
418   int height = input->dims->data[1];
419   int filter_width = filter->dims->data[2];
420   int filter_height = filter->dims->data[1];
421   int batches = input->dims->data[0];
422 
423   // Matching GetWindowedOutputSize in TensorFlow.
424   auto padding = params->padding;
425   int out_width, out_height;
426   data->padding = ComputePaddingHeightWidth(
427       params->stride_height, params->stride_width,
428       params->dilation_height_factor, params->dilation_width_factor, height,
429       width, filter_height, filter_width, padding, &out_height, &out_width);
430 
431   size_t im2col_type_size;
432   TF_LITE_ENSURE_STATUS(GetSizeOfType(context, input->type, &im2col_type_size));
433   // Note that we intentionally promote the first multiplicand (i.e. 'batches')
434   // to 'size_t' to avoid integer overflow here.
435   const size_t im2col_bytes = static_cast<size_t>(batches) * out_height *
436                               out_width * channels_in * filter_height *
437                               filter_width * im2col_type_size;
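  // Illustrative example: a 1x112x112x64 float32 input with 3x3 filters and a
  // 112x112 output needs roughly 1 * 112 * 112 * 64 * 3 * 3 * 4 bytes, i.e.
  // about 29 MB of im2col storage.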
438   TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(
439       context, node, is_hybrid, data->is_hybrid_per_channel, kernel_type,
440       im2col_bytes));
441 
442   TF_LITE_ENSURE(context, has_bias);
443 
444   // Note that full fixed-point inference requires that all tensors have their
445   // parameters set. This is usually done during quantized training or
446   // calibration.
447   if (input_type != kTfLiteFloat32) {
448     TF_LITE_ENSURE_EQ(context, filter->quantization.type,
449                       kTfLiteAffineQuantization);
450     const auto* affine_quantization =
451         reinterpret_cast<TfLiteAffineQuantization*>(
452             filter->quantization.params);
453     TF_LITE_ENSURE(context, affine_quantization);
454     TF_LITE_ENSURE(context, affine_quantization->scale);
455     TF_LITE_ENSURE(context, (affine_quantization->scale->size == 1 ||
456                              affine_quantization->scale->size == channels_out));
457 
458     data->per_channel_output_multiplier.resize(channels_out);
459     data->per_channel_output_shift.resize(channels_out);
460     TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
461         context, input, filter, bias, output, params->activation,
462         &data->output_multiplier, &data->output_shift,
463         &data->output_activation_min, &data->output_activation_max,
464         data->per_channel_output_multiplier.data(),
465         data->per_channel_output_shift.data(), channels_out));
466   }
467 
468   TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
469   output_size->data[0] = batches;
470   output_size->data[1] = out_height;
471   output_size->data[2] = out_width;
472   output_size->data[3] = channels_out;
473   auto output_status = context->ResizeTensor(context, output, output_size);
474 
475   if (output_status != kTfLiteOk) return output_status;
476 
477   if (data->need_im2col) {
478     node->temporaries->data[data->im2col_index] = data->im2col_id;
479 
480     TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);
481 
482     int input_depth = input->dims->data[3];
483     im2col_size->data[0] = output_size->data[0];
484     im2col_size->data[1] = output_size->data[1];
485     im2col_size->data[2] = output_size->data[2];
486     im2col_size->data[3] = input_depth * filter_height * filter_width;
487 
488     TfLiteTensor* im2col =
489         &context->tensors[node->temporaries->data[data->im2col_index]];
490     im2col->type = input->type;
491     if (is_hybrid) {
492       im2col->type = filter->type;
493     }
494     im2col->allocation_type = kTfLiteArenaRw;
495     auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
496     if (im2col_status != kTfLiteOk) return im2col_status;
497   }
498 
499   if (data->need_hwcn_weights) {
500     node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
501     TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);
502 
503     // Because we're treating the filter weights as a matrix when we do the
504     // transpose, we allocate the buffer with a two-dimensional shape, where one
505     // dimension is the number of elements in each filter, and the second is the
506     // total number of filters.
507     int input_depth = input->dims->data[3];
508     hwcn_weights_size->data[0] = (filter_height * filter_width * input_depth);
509     hwcn_weights_size->data[1] = channels_out;
510 
511     TfLiteTensor* hwcn_weights =
512         &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
513     hwcn_weights->type = input_type;
514     hwcn_weights->allocation_type = kTfLiteArenaRwPersistent;
515 
516     auto hwcn_weights_status =
517         context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
518     if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;
519 
520     // TODO(petewarden): If Resize() is called when the size hasn't actually
521     // changed, this will do extra redundant work.
522     data->have_weights_been_transposed = false;
523   }
524 
525   if (is_hybrid) {
526     node->temporaries->data[data->input_quantized_index] =
527         data->input_quantized_id;
528     TfLiteTensor* input_quantized;
529     TF_LITE_ENSURE_OK(
530         context, GetTemporarySafe(context, node, data->input_quantized_index,
531                                   &input_quantized));
532     input_quantized->type = kTfLiteInt8;
533     input_quantized->allocation_type = kTfLiteArenaRw;
534     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
535       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
536       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
537                                                        input_quantized_size));
538     }
539 
540     node->temporaries->data[data->scaling_factors_index] =
541         data->scaling_factors_id;
542     TfLiteTensor* scaling_factors;
543     TF_LITE_ENSURE_OK(
544         context, GetTemporarySafe(context, node, data->scaling_factors_index,
545                                   &scaling_factors));
546     scaling_factors->type = kTfLiteFloat32;
547     scaling_factors->allocation_type = kTfLiteArenaRw;
548     // Only one scale factor per batch is typically necessary. See optimized
549     // implementation for why we need to allocate for the height of the inputs
550     // flattened to 2D.
551     TF_LITE_ENSURE(context, channels_in != 0);
552     const int height = NumElements(input) / channels_in;
553     int scaling_dims[1] = {height};
554     if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
555       TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
556       scaling_factors_size->data[0] = height;
557       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
558                                                        scaling_factors_size));
559     }
560 
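    // The hybrid kernels accumulate into an int32 scratch buffer shaped
    // [channels_out, batches * out_height * out_width].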
561     node->temporaries->data[data->accum_scratch_index] = data->accum_scratch_id;
562     TfLiteTensor* accum_scratch;
563     TF_LITE_ENSURE_OK(context,
564                       GetTemporarySafe(context, node, data->accum_scratch_index,
565                                        &accum_scratch));
566     accum_scratch->type = kTfLiteInt32;
567     accum_scratch->allocation_type = kTfLiteArenaRw;
568     const int scratch_width = batches * out_height * out_width;
569     int accum_scratch_dims[2] = {channels_out, scratch_width};
570     if (!TfLiteIntArrayEqualsArray(accum_scratch->dims, 2,
571                                    accum_scratch_dims)) {
572       TfLiteIntArray* accum_scratch_size = TfLiteIntArrayCreate(2);
573       accum_scratch_size->data[0] = channels_out;
574       accum_scratch_size->data[1] = scratch_width;
575       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, accum_scratch,
576                                                        accum_scratch_size));
577     }
578 
579     if (data->is_hybrid_per_channel) {
580       const auto* affine_quantization =
581           reinterpret_cast<TfLiteAffineQuantization*>(
582               filter->quantization.params);
583       TF_LITE_ENSURE_EQ(
584           context, affine_quantization->scale->size,
585           filter->dims->data[affine_quantization->quantized_dimension]);
586       node->temporaries->data[data->input_offset_index] = data->input_offset_id;
587       TfLiteTensor* input_offsets;
588       TF_LITE_ENSURE_OK(
589           context, GetTemporarySafe(context, node, data->input_offset_index,
590                                     &input_offsets));
591       input_offsets->type = kTfLiteInt32;
592       input_offsets->allocation_type = kTfLiteArenaRw;
593       // See above comment for the need to allocate for height of inputs.
594       TF_LITE_ENSURE(context, channels_in != 0);
595       const int height = NumElements(input) / channels_in;
596       const int input_offset_dims[1] = {height};
597       if (!TfLiteIntArrayEqualsArray(input_offsets->dims, 1,
598                                      input_offset_dims)) {
599         TfLiteIntArray* input_offsets_size = TfLiteIntArrayCreate(1);
600         input_offsets_size->data[0] = input_offset_dims[0];
601         TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_offsets,
602                                                          input_offsets_size));
603       }
604       node->temporaries->data[data->row_sums_index] = data->row_sums_id;
605       TfLiteTensor* row_sums;
606       TF_LITE_ENSURE_OK(
607           context,
608           GetTemporarySafe(context, node, data->row_sums_index, &row_sums));
609       row_sums->type = kTfLiteInt32;
610       row_sums->allocation_type = kTfLiteArenaRwPersistent;
611       // The row_sums cache has one entry per output channel.
612       const int row_sums_dims[1] = {channels_out};
613       if (!TfLiteIntArrayEqualsArray(row_sums->dims, 1, row_sums_dims)) {
614         TfLiteIntArray* row_sums_size = TfLiteIntArrayCreate(1);
615         row_sums_size->data[0] = row_sums_dims[0];
616         TF_LITE_ENSURE_OK(
617             context, context->ResizeTensor(context, row_sums, row_sums_size));
618       }
619     }
620   }
621   return kTfLiteOk;
622 }
623 
624 template <KernelType kernel_type>
625 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
626   return Prepare(kernel_type, context, node);
627 }
628 
629 template <KernelType kernel_type>
630 void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
631                    TfLiteConvParams* params, OpData* data,
632                    const TfLiteTensor* input, const TfLiteTensor* filter,
633                    const TfLiteTensor* bias, TfLiteTensor* im2col,
634                    TfLiteTensor* output) {
635   auto input_offset = -input->params.zero_point;
636   auto filter_offset = -filter->params.zero_point;
637   auto output_offset = output->params.zero_point;
638 
639   KernelType effective_kernel_type;
640   if ((kernel_type == kMultithreadOptimized ||
641        kernel_type == kCblasOptimized) &&
642       (params->dilation_width_factor != 1 ||
643        params->dilation_height_factor != 1)) {
644     // kMultithreadOptimized and kCblasOptimized do not support dilation.
645     // Therefore, fall back to the generic optimized kernel.
646     effective_kernel_type = kGenericOptimized;
647   } else {
648     effective_kernel_type = kernel_type;
649   }
650 
651   // We have to fall back to the reference execution path when im2col is needed
652   // but disabled because the to-be-allocated temporary im2col tensor is too large.
653   // See b/178743262 for the detailed motivation.
654   if (data->im2col_oversized) {
655     effective_kernel_type = kReference;
656   }
657 
658   ConvParams op_params;
659   op_params.padding_type = PaddingType::kSame;
660   op_params.padding_values.width = data->padding.width;
661   op_params.padding_values.height = data->padding.height;
662   op_params.dilation_width_factor = params->dilation_width_factor;
663   op_params.dilation_height_factor = params->dilation_height_factor;
664   op_params.stride_width = params->stride_width;
665   op_params.stride_height = params->stride_height;
666   op_params.input_offset = input_offset;
667   op_params.weights_offset = filter_offset;
668   op_params.output_offset = output_offset;
669   op_params.output_multiplier = data->output_multiplier;
670   op_params.output_shift = -data->output_shift;
671   op_params.quantized_activation_min = data->output_activation_min;
672   op_params.quantized_activation_max = data->output_activation_max;
673   switch (effective_kernel_type) {
674     case kReference: {
675       reference_ops::Conv(
676           op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
677           GetTensorShape(filter), GetTensorData<uint8_t>(filter),
678           GetTensorShape(bias), GetTensorData<int32_t>(bias),
679           GetTensorShape(output), GetTensorData<uint8_t>(output),
680           GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
681           /* cpu_backend_context = */ nullptr);
682       break;
683     }
684     case kGenericOptimized:
685     case kMultithreadOptimized:
686     case kCblasOptimized: {
687       // There is only one optimized implementation for Quantized Conv.
688       optimized_ops::Conv(
689           op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
690           GetTensorShape(filter), GetTensorData<uint8_t>(filter),
691           GetTensorShape(bias), GetTensorData<int32_t>(bias),
692           GetTensorShape(output), GetTensorData<uint8_t>(output),
693           GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
694           CpuBackendContext::GetFromContext(context));
695       break;
696     }
697   }
698 }
699 
700 template <KernelType kernel_type>
701 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
702                              TfLiteConvParams* params, OpData* data,
703                              const TfLiteTensor* input,
704                              const TfLiteTensor* filter,
705                              const TfLiteTensor* bias, TfLiteTensor* output,
706                              TfLiteTensor* im2col) {
707   ConvParams op_params;
708   op_params.input_offset = -input->params.zero_point;
709   op_params.output_offset = output->params.zero_point;
710   op_params.stride_height = params->stride_height;
711   op_params.stride_width = params->stride_width;
712   op_params.dilation_height_factor = params->dilation_height_factor;
713   op_params.dilation_width_factor = params->dilation_width_factor;
714   op_params.padding_values.height = data->padding.height;
715   op_params.padding_values.width = data->padding.width;
716   op_params.quantized_activation_min = data->output_activation_min;
717   op_params.quantized_activation_max = data->output_activation_max;
718 
719   KernelType effective_kernel_type = kernel_type;
720   // We have to fall back to the reference execution path when im2col is needed
721   // but disabled because the to-be-allocated temporary im2col tensor is too large.
722   // See b/178743262 for the detailed motivation.
723   if (data->im2col_oversized) {
724     effective_kernel_type = kReference;
725   }
726 
727   switch (effective_kernel_type) {
728     case kReference: {
729       reference_integer_ops::ConvPerChannel(
730           op_params, data->per_channel_output_multiplier.data(),
731           data->per_channel_output_shift.data(), GetTensorShape(input),
732           GetTensorData<int8>(input), GetTensorShape(filter),
733           GetTensorData<int8>(filter), GetTensorShape(bias),
734           GetTensorData<int32>(bias), GetTensorShape(output),
735           GetTensorData<int8>(output));
736       break;
737     }
738     case kGenericOptimized:
739     case kMultithreadOptimized:
740     case kCblasOptimized: {
741       optimized_integer_ops::ConvPerChannel(
742           op_params, data->per_channel_output_multiplier.data(),
743           data->per_channel_output_shift.data(), GetTensorShape(input),
744           GetTensorData<int8>(input), GetTensorShape(filter),
745           GetTensorData<int8>(filter), GetTensorShape(bias),
746           GetTensorData<int32>(bias), GetTensorShape(output),
747           GetTensorData<int8>(output), GetTensorShape(im2col),
748           GetTensorData<int8>(im2col),
749           CpuBackendContext::GetFromContext(context));
750       break;
751     }
752   }
753 }
754 
755 template <KernelType kernel_type>
756 void EvalQuantizedPerChannel16x8(TfLiteContext* context, TfLiteNode* node,
757                                  TfLiteConvParams* params, OpData* data,
758                                  const TfLiteTensor* input,
759                                  const TfLiteTensor* filter,
760                                  const TfLiteTensor* bias, TfLiteTensor* output,
761                                  TfLiteTensor* im2col) {
762   ConvParams op_params;
763   op_params.input_offset = -input->params.zero_point;
764   op_params.output_offset = output->params.zero_point;
765   op_params.stride_height = params->stride_height;
766   op_params.stride_width = params->stride_width;
767   op_params.dilation_height_factor = params->dilation_height_factor;
768   op_params.dilation_width_factor = params->dilation_width_factor;
769   op_params.padding_values.height = data->padding.height;
770   op_params.padding_values.width = data->padding.width;
771   op_params.quantized_activation_min = data->output_activation_min;
772   op_params.quantized_activation_max = data->output_activation_max;
773 
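  // Only a reference implementation exists for 16x8 per-channel Conv, so all
  // kernel types dispatch to it.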
774   switch (kernel_type) {
775     case kGenericOptimized:
776     case kMultithreadOptimized:
777     case kCblasOptimized:
778     case kReference: {
779       reference_integer_ops::ConvPerChannel(
780           op_params, data->per_channel_output_multiplier.data(),
781           data->per_channel_output_shift.data(), GetTensorShape(input),
782           GetTensorData<int16>(input), GetTensorShape(filter),
783           GetTensorData<int8>(filter), GetTensorShape(bias),
784           GetTensorData<std::int64_t>(bias), GetTensorShape(output),
785           GetTensorData<int16>(output));
786       break;
787     }
788   }
789 }
790 
791 template <KernelType kernel_type>
792 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
793                TfLiteConvParams* params, OpData* data,
794                const TfLiteTensor* input, const TfLiteTensor* filter,
795                const TfLiteTensor* bias, TfLiteTensor* im2col,
796                TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
797   float output_activation_min, output_activation_max;
798   CalculateActivationRange(params->activation, &output_activation_min,
799                            &output_activation_max);
800   KernelType effective_kernel_type = kernel_type;
801   // Fall back to the optimized path if multi-threaded conv is unsupported.
802   if ((kernel_type == kMultithreadOptimized) &&
803       !data->supports_multithreaded_kernel) {
804     effective_kernel_type = kGenericOptimized;
805   }
806 
807   // When im2col is needed (which is implied when 'im2col_oversized' is true),
808   // the GEMM-based optimized path requires the im2col data to be allocated to
809   // ensure correctness. Therefore, when im2col is disabled because of the
810   // oversized temporary im2col tensor, a fallback to a non-optimized path is
811   // needed.
812   // See b/178743262 for the detailed motivation.
813   if (data->im2col_oversized) {
814     effective_kernel_type = kReference;
815 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
816     // As detailed in the tflite::multithreaded_ops::Conv implementation in
817     // multithreaded_conv.h, the Eigen-based execution doesn't need im2col data.
818     // Therefore, we can rely on it as a better-optimized fallback than the
819     // reference one.
820     if (data->supports_multithreaded_kernel) {
821       effective_kernel_type = kMultithreadOptimized;
822     }
823 #endif
824   }
825 
826   ConvParams op_params;
827   op_params.padding_type = RuntimePaddingType(params->padding);
828   op_params.padding_values.width = data->padding.width;
829   op_params.padding_values.height = data->padding.height;
830   op_params.stride_width = params->stride_width;
831   op_params.stride_height = params->stride_height;
832   op_params.dilation_width_factor = params->dilation_width_factor;
833   op_params.dilation_height_factor = params->dilation_height_factor;
834   op_params.float_activation_min = output_activation_min;
835   op_params.float_activation_max = output_activation_max;
836   switch (effective_kernel_type) {
837     case kReference: {
838       reference_ops::Conv(op_params, GetTensorShape(input),
839                           GetTensorData<float>(input), GetTensorShape(filter),
840                           GetTensorData<float>(filter), GetTensorShape(bias),
841                           GetTensorData<float>(bias), GetTensorShape(output),
842                           GetTensorData<float>(output), GetTensorShape(im2col),
843                           GetTensorData<float>(im2col));
844       break;
845     }
846     case kCblasOptimized:
847     case kGenericOptimized: {
848       optimized_ops::Conv(op_params, GetTensorShape(input),
849                           GetTensorData<float>(input), GetTensorShape(filter),
850                           GetTensorData<float>(filter), GetTensorShape(bias),
851                           GetTensorData<float>(bias), GetTensorShape(output),
852                           GetTensorData<float>(output), GetTensorShape(im2col),
853                           GetTensorData<float>(im2col),
854                           CpuBackendContext::GetFromContext(context));
855       break;
856     }
857     case kMultithreadOptimized: {
858 #if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
859       const float* filter_data;
860       if (data->need_hwcn_weights) {
861         filter_data = GetTensorData<float>(hwcn_weights);
862       } else {
863         filter_data = GetTensorData<float>(filter);
864       }
865       multithreaded_ops::Conv(
866           *eigen_support::GetThreadPoolDevice(context), op_params,
867           GetTensorShape(input), GetTensorData<float>(input),
868           GetTensorShape(filter), filter_data, GetTensorShape(bias),
869           GetTensorData<float>(bias), GetTensorShape(output),
870           GetTensorData<float>(output), GetTensorShape(im2col),
871           GetTensorData<float>(im2col));
872       break;
873 #else   // !defined(TFLITE_WITH_MULTITHREADED_EIGEN)
874       // See Register_CONV_2D: we should never be here when TFLITE_WITH_RUY
875       // is enabled. We #if out this code in order to get the corresponding
876       // binary size benefits.
877       TFLITE_DCHECK(false);
878 #endif  // defined(TFLITE_WITH_MULTITHREADED_EIGEN)
879     }
880   }
881 }
882 
883 template <KernelType kernel_type>
884 TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
885                                   TfLiteConvParams* params, OpData* data,
886                                   const TfLiteTensor* input,
887                                   const TfLiteTensor* filter,
888                                   const TfLiteTensor* bias,
889                                   TfLiteTensor* im2col, TfLiteTensor* output) {
890   float output_activation_min, output_activation_max;
891   CalculateActivationRange(params->activation, &output_activation_min,
892                            &output_activation_max);
893 
894   const int batch_size = SizeOfDimension(input, 0);
895   TF_LITE_ENSURE(context, batch_size != 0);
896   const int input_size = NumElements(input) / batch_size;
897   TfLiteTensor* quantized_input_tensor;
898   TF_LITE_ENSURE_OK(context,
899                     GetTemporarySafe(context, node, data->input_quantized_index,
900                                      &quantized_input_tensor));
901   int8_t* quantized_input_ptr_batch =
902       GetTensorData<int8_t>(quantized_input_tensor);
903   TfLiteTensor* scaling_factors_tensor;
904   TF_LITE_ENSURE_OK(context,
905                     GetTemporarySafe(context, node, data->scaling_factors_index,
906                                      &scaling_factors_tensor));
907   float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor);
908   TfLiteTensor* input_offset_tensor;
909   TF_LITE_ENSURE_OK(context,
910                     GetTemporarySafe(context, node, data->input_offset_index,
911                                      &input_offset_tensor));
912   int32_t* input_offset_ptr = GetTensorData<int32_t>(input_offset_tensor);
913 
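  // Asymmetrically quantize each batch of the float input on the fly,
  // recording a per-batch scaling factor and zero point (input offset).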
914   for (int b = 0; b < batch_size; ++b) {
915     const int offset = b * input_size;
916     tensor_utils::AsymmetricQuantizeFloats(
917         GetTensorData<float>(input) + offset, input_size,
918         quantized_input_ptr_batch + offset, &scaling_factors_ptr[b],
919         &input_offset_ptr[b]);
920   }
921 
922   int8_t* im2col_ptr = nullptr;
923   int8_t* filter_ptr = nullptr;
924   if (im2col != nullptr) {
925     im2col_ptr = im2col->data.int8;
926   }
927   filter_ptr = filter->data.int8;
928   const auto* affine_quantization =
929       reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
930 
931   KernelType effective_kernel_type = kernel_type;
932   // We have to fall back to the reference execution path when im2col is needed
933   // but disabled because the to-be-allocated temporary im2col tensor is too large.
934   // See b/178743262 for the detailed motivation.
935   if (data->im2col_oversized) {
936     effective_kernel_type = kReference;
937   }
938 
939   ConvParams op_params;
940   op_params.padding_type = PaddingType::kSame;
941   op_params.padding_values.width = data->padding.width;
942   op_params.padding_values.height = data->padding.height;
943   op_params.dilation_width_factor = params->dilation_width_factor;
944   op_params.dilation_height_factor = params->dilation_height_factor;
945   op_params.stride_width = params->stride_width;
946   op_params.stride_height = params->stride_height;
947   op_params.float_activation_min = output_activation_min;
948   op_params.float_activation_max = output_activation_max;
949   switch (effective_kernel_type) {
950     case kReference:
951       reference_ops::HybridConvPerChannel(
952           op_params, scaling_factors_ptr, GetTensorShape(input),
953           quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
954           GetTensorShape(bias), GetTensorData<float>(bias),
955           GetTensorShape(output), GetTensorData<float>(output),
956           GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data,
957           input_offset_ptr);
958       break;
959     case kGenericOptimized:
960     case kMultithreadOptimized:
961     case kCblasOptimized: {
962       TfLiteTensor* row_sums;
963       TF_LITE_ENSURE_OK(
964           context,
965           GetTemporarySafe(context, node, data->row_sums_index, &row_sums));
966       TfLiteTensor* scratch;
967       TF_LITE_ENSURE_OK(
968           context,
969           GetTemporarySafe(context, node, data->accum_scratch_index, &scratch));
970       optimized_ops::HybridConvPerChannel(
971           op_params, scaling_factors_ptr, GetTensorShape(input),
972           quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
973           GetTensorShape(bias), GetTensorData<float>(bias),
974           GetTensorShape(output), GetTensorData<float>(output),
975           GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data,
976           input_offset_ptr, GetTensorShape(scratch),
977           GetTensorData<int32>(scratch), GetTensorData<int32_t>(row_sums),
978           &data->compute_hybrid_row_sums,
979           CpuBackendContext::GetFromContext(context));
980       data->compute_hybrid_row_sums = false;
981       break;
982     }
983   }
984 
985   return kTfLiteOk;
986 }
987 
988 template <KernelType kernel_type>
989 TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
990                         TfLiteConvParams* params, OpData* data,
991                         const TfLiteTensor* input, const TfLiteTensor* filter,
992                         const TfLiteTensor* bias, TfLiteTensor* im2col,
993                         TfLiteTensor* accum_scratch, TfLiteTensor* output) {
994   float output_activation_min, output_activation_max;
995   CalculateActivationRange(params->activation, &output_activation_min,
996                            &output_activation_max);
997 
998   const int batch_size = SizeOfDimension(input, 0);
999   TF_LITE_ENSURE(context, batch_size != 0);
1000   const int input_size = NumElements(input) / batch_size;
1001 
1002   const float* input_ptr = GetTensorData<float>(input);
1003   TfLiteTensor* quantized_input_tensor;
1004   TF_LITE_ENSURE_OK(context,
1005                     GetTemporarySafe(context, node, data->input_quantized_index,
1006                                      &quantized_input_tensor));
1007   int8_t* quantized_input_ptr_batch =
1008       GetTensorData<int8_t>(quantized_input_tensor);
1009   TfLiteTensor* scaling_factors_tensor;
1010   TF_LITE_ENSURE_OK(context,
1011                     GetTemporarySafe(context, node, data->scaling_factors_index,
1012                                      &scaling_factors_tensor));
1013   float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor);
1014 
1015   // Per-batch input quantization for higher accuracy.
1016   {
1017     ruy::profiler::ScopeLabel label("ConvHybridQuantizeInputs");
1018     for (int b = 0; b < batch_size; ++b) {
1019       float unused_min, unused_max;
1020       const int offset = b * input_size;
1021       tensor_utils::SymmetricQuantizeFloats(
1022           input_ptr + offset, input_size, quantized_input_ptr_batch + offset,
1023           &unused_min, &unused_max, &scaling_factors_ptr[b]);
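      // Fold the filter scale into the per-batch scaling factor so the int8
      // accumulator can be dequantized with a single multiply.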
1024       scaling_factors_ptr[b] *= filter->params.scale;
1025     }
1026   }
1027 
1028   switch (kernel_type) {
1029     case kReference:
1030     case kGenericOptimized:
1031     case kMultithreadOptimized:
1032     case kCblasOptimized: {
1033       // There is only one implementation for the hybrid kernel.
1034       ConvParams op_params;
1035       op_params.padding_type = PaddingType::kSame;
1036       op_params.padding_values.width = data->padding.width;
1037       op_params.padding_values.height = data->padding.height;
1038       op_params.stride_width = params->stride_width;
1039       op_params.stride_height = params->stride_height;
1040       op_params.dilation_width_factor = params->dilation_width_factor;
1041       op_params.dilation_height_factor = params->dilation_height_factor;
1042       op_params.float_activation_min = output_activation_min;
1043       op_params.float_activation_max = output_activation_max;
1044       optimized_ops::HybridConv(
1045           op_params, scaling_factors_ptr, GetTensorShape(input),
1046           quantized_input_ptr_batch, GetTensorShape(filter),
1047           GetTensorData<int8_t>(filter), GetTensorShape(bias),
1048           GetTensorData<float>(bias), GetTensorShape(accum_scratch),
1049           GetTensorData<int32_t>(accum_scratch), GetTensorShape(output),
1050           GetTensorData<float>(output), GetTensorShape(im2col),
1051           GetTensorData<int8_t>(im2col),
1052           CpuBackendContext::GetFromContext(context));
1053       break;
1054     }
1055   }
1056 
1057   return kTfLiteOk;
1058 }
1059 
1060 template <KernelType kernel_type, TfLiteType input_type>
1061 TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) {
1062   auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
1063   OpData* data = reinterpret_cast<OpData*>(node->user_data);
1064 
1065   TfLiteTensor* output;
1066   TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
1067   const TfLiteTensor* input;
1068   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
1069   const TfLiteTensor* filter;
1070   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
1071   bool has_bias = node->inputs->size == 3;
1072   const TfLiteTensor* bias = has_bias ? GetInput(context, node, 2) : nullptr;
1073   TfLiteTensor* im2col =
1074       data->need_im2col
1075           ? &context->tensors[node->temporaries->data[data->im2col_index]]
1076           : nullptr;
1077   TfLiteTensor* hwcn_weights =
1078       data->need_hwcn_weights
1079           ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
1080           : nullptr;
1081 
1082   if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
1083     TransposeFloatTensor(filter, hwcn_weights);
1084     data->have_weights_been_transposed = true;
1085   }
1086 
1087   TFLITE_DCHECK_EQ(input_type, input->type);
1088   switch (input_type) {  // Already know in/out types are the same.
1089     case kTfLiteFloat32:
1090       if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) {
1091         if (data->is_hybrid_per_channel) {
1092           TF_LITE_ENSURE_OK(context, EvalHybridPerChannel<kernel_type>(
1093                                          context, node, params, data, input,
1094                                          filter, bias, im2col, output));
1095         } else {
1096           TfLiteTensor* accum_scratch =
1097               &context->tensors[node->temporaries
1098                                     ->data[data->accum_scratch_index]];
1099           TF_LITE_ENSURE_OK(context,
1100                             EvalHybrid<kernel_type>(context, node, params, data,
1101                                                     input, filter, bias, im2col,
1102                                                     accum_scratch, output));
1103         }
1104       } else {
1105         EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
1106                                im2col, hwcn_weights, output);
1107       }
1108       break;
1109     case kTfLiteUInt8:
1110       EvalQuantized<kernel_type>(context, node, params, data, input, filter,
1111                                  bias, im2col, output);
1112       break;
1113     case kTfLiteInt8:
1114       EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input,
1115                                            filter, bias, output, im2col);
1116       break;
1117     case kTfLiteInt16:
1118       EvalQuantizedPerChannel16x8<kernel_type>(
1119           context, node, params, data, input, filter, bias, output, im2col);
1120       break;
1121     default:
1122       TF_LITE_KERNEL_LOG(context, "Type %s currently not supported.",
1123                          TfLiteTypeGetName(input->type));
1124       return kTfLiteError;
1125   }
1126   return kTfLiteOk;
1127 }
1128 
1129 template <KernelType kernel_type>
1130 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
1131   const TfLiteTensor* input;
1132   TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
1133 
1134   switch (input->type) {
1135     case kTfLiteFloat32:
1136       return EvalImpl<kernel_type, kTfLiteFloat32>(context, node);
1137     case kTfLiteUInt8:
1138       return EvalImpl<kernel_type, kTfLiteUInt8>(context, node);
1139     case kTfLiteInt8:
1140       return EvalImpl<kernel_type, kTfLiteInt8>(context, node);
1141     case kTfLiteInt16:
1142       return EvalImpl<kernel_type, kTfLiteInt16>(context, node);
1143     default:
1144       TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
1145                          TfLiteTypeGetName(input->type));
1146       return kTfLiteError;
1147   }
1148 }
1149 
1150 }  // namespace conv
1151 
1152 TfLiteRegistration* Register_CONVOLUTION_REF() {
1153   static TfLiteRegistration r = {conv::Init, conv::Free,
1154                                  conv::Prepare<conv::kReference>,
1155                                  conv::Eval<conv::kReference>};
1156   return &r;
1157 }
1158 
1159 TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
1160   static TfLiteRegistration r = {conv::Init, conv::Free,
1161                                  conv::Prepare<conv::kGenericOptimized>,
1162                                  conv::Eval<conv::kGenericOptimized>};
1163   return &r;
1164 }
1165 
1166 TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT_UINT8() {
1167   static TfLiteRegistration r = {
1168       conv::Init, conv::Free, conv::Prepare<conv::kGenericOptimized>,
1169       conv::EvalImpl<conv::kGenericOptimized, kTfLiteUInt8>};
1170   return &r;
1171 }
1172 
1173 TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
1174   static TfLiteRegistration r = {conv::Init, conv::Free,
1175                                  conv::Prepare<conv::kMultithreadOptimized>,
1176                                  conv::Eval<conv::kMultithreadOptimized>};
1177   return &r;
1178 }
1179 
1180 TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
1181   static TfLiteRegistration r = {conv::Init, conv::Free,
1182                                  conv::Prepare<conv::kCblasOptimized>,
1183                                  conv::Eval<conv::kCblasOptimized>};
1184   return &r;
1185 }
1186 
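// Selects the default Conv kernel at build time: the Apple Accelerate CBLAS
// kernel if requested, otherwise the multithreaded Eigen kernel when ruy is
// disabled, otherwise the generic optimized kernel.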
1187 TfLiteRegistration* Register_CONV_2D() {
1188 #if defined TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
1189   return Register_CONVOLUTION_CBLAS_OPT();
1190 #elif defined TFLITE_WITH_MULTITHREADED_EIGEN
1191   return Register_CONVOLUTION_MULTITHREADED_OPT();
1192 #else
1193   return Register_CONVOLUTION_GENERIC_OPT();
1194 #endif
1195 }
1196 
1197 // Warning: Clients using this variant are responsible for ensuring that their
1198 // models only need the UINT8 type. TFLite's op registration mechanism doesn't
1199 // yet allow for anything more nuanced.
1200 TfLiteRegistration* Register_CONV_2D_UINT8() {
1201 #if defined TFLITE_WITH_RUY
1202   // TFLITE_WITH_RUY optimizes the generic kernel type.
1203   return Register_CONVOLUTION_GENERIC_OPT_UINT8();
1204 #else
1205   return Register_CONV_2D();
1206 #endif
1207 }
1208 
1209 }  // namespace builtin
1210 }  // namespace ops
1211 }  // namespace tflite
1212